Skip to content

Commit 8c8eb43

Browse files
Gu, Yonghao and TaoLv
authored and committed
graph: backend: dnnl: support int8 sdpa for softmax
1 parent 49ceeaf commit 8c8eb43

5 files changed

Lines changed: 71 additions & 5 deletions

File tree

src/graph/backend/dnnl/kernels/sdp_decomp.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,11 +77,14 @@ status_t sdp_decomp_kernel_t<quantized, dt>::compile_impl(
7777
BACKEND_DNNL_ADD_PASS(pipeline, fuse_post_ops);
7878
BACKEND_DNNL_ADD_PASS(pipeline, insert_permute_for_matmul);
7979
if (quantized) {
80+
BACKEND_DNNL_ADD_PASS(pipeline, remove_quant_data_with_no_effect);
8081
BACKEND_DNNL_ADD_PASS(pipeline, convert_to_runtime_dst_scales);
8182
BACKEND_DNNL_ADD_PASS(pipeline, fuse_dst_scales);
8283
BACKEND_DNNL_ADD_PASS(pipeline, convert_to_runtime_dst_zero_points);
8384
BACKEND_DNNL_ADD_PASS(pipeline, fuse_dst_zero_points);
84-
BACKEND_DNNL_ADD_PASS(pipeline, remove_quant_data_with_no_effect);
85+
// fuse those new post-binaries converted from add_zps and mul_scales
86+
BACKEND_DNNL_ADD_PASS(pipeline, replace_quant_data_with_binary_post_op);
87+
BACKEND_DNNL_ADD_PASS(pipeline, fuse_post_ops);
8588
}
8689
pipeline.reset_visualize_arg(true, false);
8790
BACKEND_DNNL_ADD_PASS(pipeline, fuse_dst_transpose_to_predecessor);

src/graph/backend/dnnl/kernels/sdp_decomp.hpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,8 +118,7 @@ struct sdp_decomp_kernel_t : public kernel_base_t {
118118
mem_map[ori_mem.get()][tid]
119119
= memory(ori_mem.get_desc(),
120120
ori_mem.get_engine(), nullptr);
121-
if (iter.first >= DNNL_ARG_ATTR_SCALES
122-
&& iter.first <= DNNL_ARG_ATTR_POST_OP_DW) {
121+
if (iter.first >= DNNL_ARG_ATTR_SCALES) {
123122
mem_map[ori_mem.get()][tid].set_data_handle(
124123
ori_mem.get_data_handle());
125124
}

src/graph/backend/dnnl/kernels/sdp_decomp_config.cpp

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ impl::status_t sdp_decomp_config_t::construct_params(
135135
sub_mm1_wei_md, sub_mm1_dst_md, sub_softmax_dst_md,
136136
sub_wei2_user_md, sub_mm2_wei_md, sub_mm2_dst_md, sub_dst_md,
137137
sub_dst_user_md, sub_select_cond_md, sub_select_src0_md;
138-
std::vector<memory::desc> sub_mm1_post_md;
138+
std::vector<memory::desc> sub_mm1_post_md, sub_softmax_post_md;
139139

140140
// must use user mode to support concurrent execution
141141
primitive_attr sub_reorder0_attr;
@@ -229,6 +229,25 @@ impl::status_t sdp_decomp_config_t::construct_params(
229229
// softmax
230230
// create softmax primitive attr
231231
dnnl::primitive_attr sub_softmax_attr = make_primitive_attr(sdp_op[2], mgr);
232+
233+
dnnl_pops = {};
234+
ori_dnnl_pops = sub_softmax_attr.get_post_ops();
235+
for (int i = 0; i < ori_dnnl_pops.get()->len(); i++) {
236+
const auto alg = static_cast<algorithm>(
237+
ori_dnnl_pops.get()->entry_[i].binary.alg);
238+
const dnnl::impl::memory_desc_t &ori_desc
239+
= ori_dnnl_pops.get()->entry_[i].binary.user_src1_desc;
240+
auto post_shape = ori_desc.dims;
241+
auto post_stride = ori_desc.format_desc.blocking.strides;
242+
auto post_dt = static_cast<memory::data_type>(ori_desc.data_type);
243+
dims post_stride_dims = dims(post_stride, post_stride + ori_desc.ndims);
244+
auto new_sub_md = memory::desc({1, 1, post_shape[2], post_shape[3]},
245+
post_dt, post_stride_dims);
246+
sub_softmax_post_md.emplace_back(new_sub_md);
247+
dnnl_pops.append_binary(alg, new_sub_md);
248+
}
249+
sub_softmax_attr.set_post_ops(dnnl_pops);
250+
232251
sub_softmax_dst_md = memory::desc(sub_mm1_dst_dims, dt_src_user, tag::abcd);
233252
const auto mode = sdp_op[2]->get_attr<std::string>(op_attr::mode);
234253
const dnnl::algorithm algo = mode == "inf_as_zero"
@@ -337,6 +356,23 @@ impl::status_t sdp_decomp_config_t::construct_params(
337356
}
338357
// softmax
339358
sub_softmax_dst = memory(sub_softmax_dst_md, p_engine, nullptr);
359+
for (int i = 0; i < (int)sub_softmax_post_md.size(); i++) {
360+
sub_softmax_post_mem.emplace_back(sub_softmax_post_md[i], p_engine);
361+
auto alg = static_cast<algorithm>(
362+
ori_dnnl_pops.get()->entry_[i].binary.alg);
363+
if (alg == dnnl::algorithm::binary_mul) {
364+
float *ptr = reinterpret_cast<float *>(
365+
sub_softmax_post_mem[i].get_data_handle());
366+
ptr[0] = get_attr_value<float, float>(
367+
sdp_op[2], i + 1, op_attr::scales);
368+
}
369+
if (alg == dnnl::algorithm::binary_add) {
370+
int *ptr = reinterpret_cast<int *>(
371+
sub_softmax_post_mem[i].get_data_handle());
372+
ptr[0] = get_attr_value<int64_t, int32_t>(
373+
sdp_op[2], i + 1, op_attr::zps);
374+
}
375+
}
340376
// reorder2
341377
sub_wei2_user = memory(sub_wei2_user_md, p_engine, nullptr);
342378
// mm2
@@ -372,6 +408,12 @@ impl::status_t sdp_decomp_config_t::construct_params(
372408
{DNNL_ARG_DST, sub_softmax_dst},
373409
{DNNL_ARG_SCRATCHPAD, sub_scratchpad}};
374410

411+
for (int i = 0; i < (int)sub_softmax_post_mem.size(); i++) {
412+
sub_softmax_args.insert(
413+
{DNNL_ARG_ATTR_MULTIPLE_POST_OP(i) | DNNL_ARG_SRC_1,
414+
sub_softmax_post_mem[i]});
415+
}
416+
375417
sub_reorder2_args = {{DNNL_ARG_SRC, sub_wei2_user},
376418
{DNNL_ARG_DST, sub_mm2_wei}, {DNNL_ARG_SCRATCHPAD, sub_scratchpad}};
377419

src/graph/backend/dnnl/kernels/sdp_decomp_config.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ struct sdp_decomp_config_t {
109109
//mm1
110110
memory sub_mm1_src, sub_mm1_wei, sub_mm1_dst;
111111
// sub_mm1_post_mem contains [post_scale, attn_mask(optional)]
112-
std::vector<memory> sub_mm1_post_mem;
112+
std::vector<memory> sub_mm1_post_mem, sub_softmax_post_mem;
113113
//select binary
114114
memory sub_select_cond, sub_select_src0, sub_select_dst;
115115
//softmax

src/graph/backend/dnnl/passes/transform.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1207,6 +1207,17 @@ status_t fuse_dst_scales(std::shared_ptr<subgraph_t> &sg) {
12071207
if (consumers.size() != 1) continue;
12081208
auto &next_op = consumers[0].get_op();
12091209
if (next_op.get_kind() != op_kind::dnnl_mul_scales) continue;
1210+
// For these three ops, the dst zps are not supported
1211+
if (impl::utils::one_of(cur_op->get_kind(), op_kind::dnnl_softmax,
1212+
op_kind::dnnl_layernorm, op_kind::dnnl_groupnorm)) {
1213+
out_val = next_op.get_output_value(0);
1214+
consumers = out_val->get_consumers();
1215+
if (consumers.size() == 1) {
1216+
auto &next2_op = consumers[0].get_op();
1217+
if (next2_op.get_kind() == op_kind::dnnl_add_zps) continue;
1218+
}
1219+
}
1220+
12101221
fuse_groups.emplace_back(cur_op.get(), &next_op);
12111222
visited.insert(cur_op.get());
12121223
visited.insert(&next_op);
@@ -1249,6 +1260,17 @@ status_t convert_to_runtime_dst_scales(std::shared_ptr<subgraph_t> &sg) {
12491260
|| visited.count(cur_op.get()))
12501261
continue;
12511262

1263+
if (impl::utils::one_of(cur_op->get_input_op(0)->get_kind(),
1264+
op_kind::dnnl_softmax, op_kind::dnnl_layernorm,
1265+
op_kind::dnnl_groupnorm)) {
1266+
auto out_val = cur_op->get_output_value(0);
1267+
auto consumers = out_val->get_consumers();
1268+
if (consumers.size() == 1) {
1269+
auto &next_op = consumers[0].get_op();
1270+
if (next_op.get_kind() == op_kind::dnnl_add_zps) continue;
1271+
}
1272+
}
1273+
12521274
// This pass only handle static quantization
12531275
bool dync_quantization = cur_op->has_attr(op_attr::with_runtime_scales)
12541276
&& cur_op->get_attr<bool>(op_attr::with_runtime_scales);

0 commit comments

Comments (0)