Vectorize Softmax Phoenix and Strix (#2139)

endtaka-amd · github-actions[bot] · web-flow · commit 4aa4e3873175 · 2025-03-19T20:12:03.000Z
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
diff --git a/aie_kernels/aie2/softmax.cc b/aie_kernels/aie2/softmax.cc
@@ -0,0 +1,75 @@
+//===- softmax.cc --------------------------------------------*- C++
+//-*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2025, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#include <aie_api/aie.hpp>
+#include <lut_based_ops.h>
+#include <stdint.h>
+
+using namespace aie;
+
+// Softmax over a bfloat16 buffer, handled as 16-lane vectors in two passes:
+//   1) output = exp(input) using getExpBf16() from lut_based_ops.h, while
+//      summing the exponentials per lane in a float accumulator;
+//   2) output *= 1 / sum, rescaling the buffer in place.
+// Only vector_size / 16 whole vectors are touched; any tail elements are
+// neither read nor written.
+// NOTE(review): no running max is subtracted before exp, so large inputs may
+// overflow -- confirm the expected input range with the callers.
+void softmax_simple_bf16(bfloat16 *restrict input_vector,
+                         bfloat16 *restrict output_vector,
+                         const int32_t vector_size) {
+  event0();
+
+  // Both passes visit the same number of whole 16-lane vectors.
+  const int num_vecs = vector_size / 16;
+
+  auto it_exp_in = aie::cbegin_vector<16>((bfloat16 *)input_vector);
+  auto it_exp_out = aie::begin_vector<16>((bfloat16 *)output_vector);
+  auto it_scale = aie::cbegin_restrict_vector<16>((bfloat16 *)output_vector);
+  auto it_soft_out = aie::begin_restrict_vector<16>((bfloat16 *)output_vector);
+
+  /////////////////////
+  //// Compute exp ////
+  /////////////////////
+  aie::vector<bfloat16, 16> input_bf16, exp_val;
+  aie::accum<accfloat, 16> exp_val_accum = aie::zeros<accfloat, 16>();
+  for (int i = 0; i < num_vecs; i++) {
+    input_bf16 = *it_exp_in++;
+    exp_val = to_v16bfloat16(getExpBf16(input_bf16));
+    exp_val_accum = add(exp_val_accum, exp_val);
+    *it_exp_out++ = exp_val;
+  }
+  // Collapse the 16 per-lane partial sums into the scalar denominator.
+  const aie::vector<float, 16> lane_sums = exp_val_accum.to_vector<float>();
+  const float accum_exp_val = aie::reduce_add(lane_sums);
+
+  //////////////////////////
+  //// Scale by 1 / sum ////
+  //////////////////////////
+  // The reciprocal is narrowed to bfloat16 once and reused for every vector.
+  const bfloat16 col_sum_inv = static_cast<bfloat16>(aie::inv(accum_exp_val));
+  for (int c = 0; c < num_vecs; c++) {
+    const aie::vector<bfloat16, 16> in_elems = *it_scale++;
+    aie::accum<accfloat, 16> out_vals = aie::mul(in_elems, col_sum_inv);
+    *it_soft_out++ = out_vals.to_vector<bfloat16>();
+  }
+
+  event1();
+}
+
+extern "C" {
+// Unmangled (C linkage) entry point; forwards straight to softmax_simple_bf16.
+void softmax_bf16(bfloat16 *restrict input, bfloat16 *restrict output,
+                  const int32_t input_size) {
+  softmax_simple_bf16(input, output, input_size);
+}
+
+} // extern "C"
diff --git a/aie_kernels/aie2p/softmax.cc b/aie_kernels/aie2p/softmax.cc
@@ -0,0 +1,81 @@
+//===- softmax.cc --------------------------------------------*- C++
+//-*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2025, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#include <aie_api/aie.hpp>
+#include <stdint.h>
+
+#define SM_VEC_LEN 16   // vector lanes; 32 presumably an alternative (confirm)
+#define log2e 1.4453125 // log2(e) ~= 1.44269504089 at bfloat16 precision
+
+using namespace aie;
+
+// Softmax over a bfloat16 buffer, SM_VEC_LEN lanes at a time, in two passes:
+//   1) output = 2^(input * log2e), i.e. exp(input) evaluated with aie::exp2,
+//      while summing the exponentials per lane in a float accumulator;
+//   2) output *= 1 / sum, rescaling the buffer in place.
+// Only vector_size / SM_VEC_LEN whole vectors are touched; any tail elements
+// are neither read nor written.
+// NOTE(review): no running max is subtracted before exp, so large inputs may
+// overflow -- confirm the expected input range with the callers.
+void softmax_simple_bf16(bfloat16 *restrict input_vector,
+                         bfloat16 *restrict output_vector,
+                         const int32_t vector_size) {
+  event0();
+
+  // Both passes visit the same number of whole SM_VEC_LEN-lane vectors.
+  const int num_vecs = vector_size / SM_VEC_LEN;
+
+  auto it_exp_in = aie::cbegin_vector<SM_VEC_LEN>((bfloat16 *)input_vector);
+  auto it_exp_out = aie::begin_vector<SM_VEC_LEN>((bfloat16 *)output_vector);
+  auto it_scale =
+      aie::cbegin_restrict_vector<SM_VEC_LEN>((bfloat16 *)output_vector);
+  auto it_soft_out =
+      aie::begin_restrict_vector<SM_VEC_LEN>((bfloat16 *)output_vector);
+
+  /////////////////////
+  //// Compute exp ////
+  /////////////////////
+  const aie::vector<bfloat16, SM_VEC_LEN> log2e_vec =
+      aie::broadcast<bfloat16, SM_VEC_LEN>(log2e);
+  aie::vector<bfloat16, SM_VEC_LEN> input_bf16, exp_val;
+  aie::accum<accfloat, SM_VEC_LEN> exp_in, exp_val_accum;
+  exp_val_accum = aie::zeros<accfloat, SM_VEC_LEN>();
+  for (int i = 0; i < num_vecs; i++) {
+    input_bf16 = *it_exp_in++;
+    exp_in = aie::mul(input_bf16, log2e_vec); // x * log2(e), kept in float
+    exp_val = aie::exp2<bfloat16>(exp_in.to_vector<float>());
+    exp_val_accum = add(exp_val_accum, exp_val);
+    *it_exp_out++ = exp_val;
+  }
+  // Collapse the per-lane partial sums into the scalar denominator.
+  const auto lane_sums = exp_val_accum.to_vector<float>();
+  const float accum_exp_val = aie::reduce_add(lane_sums);
+
+  // The reciprocal is narrowed to bfloat16 once and reused for every vector.
+  const bfloat16 col_sum_inv = static_cast<bfloat16>(aie::inv(accum_exp_val));
+  for (int c = 0; c < num_vecs; c++) {
+    const aie::vector<bfloat16, SM_VEC_LEN> in_elems = *it_scale++;
+    aie::accum<accfloat, SM_VEC_LEN> out_vals =
+        aie::mul(in_elems, col_sum_inv);
+    *it_soft_out++ = out_vals.to_vector<bfloat16>();
+  }
+
+  event1();
+}
+
+extern "C" {
+// Unmangled (C linkage) entry point; forwards straight to softmax_simple_bf16.
+void softmax_bf16(bfloat16 *restrict input, bfloat16 *restrict output,
+                  const int32_t input_size) {
+  softmax_simple_bf16(input, output, input_size);
+}
+
+} // extern "C"
diff --git a/programming_examples/ml/softmax/Makefile b/programming_examples/ml/softmax/Makefile
@@ -12,9 +12,16 @@ srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
 
 include ${srcdir}/../../makefile-common
 
-VPATH := ${srcdir}/../../../aie_kernels/aie2
+aie2_runtime_dir = ${AIEOPT_DIR}/aie_runtime_lib/AIE2
 
 device ?= $(if $(filter 1,$(NPU2)),npu2,npu)
+
+ifeq ($(device),npu2)
+VPATH :=${srcdir}/../../../aie_kernels/aie2p
+else 
+VPATH :=${srcdir}/../../../aie_kernels/aie2
+endif
+
 targetname = softmax
 trace_size = 8192
 
@@ -27,42 +34,33 @@ endif
 
 all: build/final.xclbin build/insts.txt
 
-build/dut.cc: ${srcdir}/bf16_softmax.mlir
-	mkdir -p ${@D}
-	cd ${@D} &&	aie-opt $< -affine-super-vectorize="virtual-vector-size=16 test-fastest-varying=0 vectorize-reductions=true" --convert-vector-to-aievec="aie-target=aie2" -lower-affine | aie-translate -aie2=true --aievec-to-cpp -o ${@F}
-
-build/dut.o: build/dut.cc
 ifeq ($(device),npu)
-	cd ${@D} &&	${PEANO_INSTALL_DIR}/bin/clang++ ${PEANOWRAP2_FLAGS} -I../../../../aie_runtime_lib/AIE2 -c ${<F} -o ${@F}
-else ifeq ($(device),npu2)
-	cd ${@D} &&	${PEANO_INSTALL_DIR}/bin/clang++ ${PEANOWRAP2P_FLAGS} -I../../../../aie_runtime_lib/AIE2P -c ${<F} -o ${@F}
-else
-	echo "Device type not supported"
+build/lut_based_ops.o: ${aie2_runtime_dir}/lut_based_ops.cpp
+	mkdir -p ${@D}
+	cd ${@D} && ${PEANO_INSTALL_DIR}/bin/clang++ ${PEANOWRAP2_FLAGS} -I. -c $< -o ${@F}
 endif
 
-build/lut_based_ops.o: ../../../aie_runtime_lib/AIE2/lut_based_ops.cpp
+	
+build/softmax.o: ${VPATH}/softmax.cc
 	mkdir -p ${@D}
 ifeq ($(device),npu)
-	cd ${@D} && ${PEANO_INSTALL_DIR}/bin/clang++ ${PEANOWRAP2_FLAGS} -I. -c $(<:%=../%) -o ${@F}
+	cd ${@D} && ${PEANO_INSTALL_DIR}/bin/clang++ ${PEANOWRAP2_FLAGS} -I. -I${aie2_runtime_dir} -c $< -o ${@F}
 else ifeq ($(device),npu2)
-	cd ${@D} && ${PEANO_INSTALL_DIR}/bin/clang++ ${PEANOWRAP2P_FLAGS} -I. -c $(<:%=../%) -o ${@F}
+	cd ${@D} && ${PEANO_INSTALL_DIR}/bin/clang++ ${PEANOWRAP2P_FLAGS} -c $< -o ${@F}
 else
 	echo "Device type not supported"
 endif
 
-build/softmax.o: bf16_softmax.cc
-	mkdir -p ${@D}
 ifeq ($(device),npu)
-	cd ${@D} && ${PEANO_INSTALL_DIR}/bin/clang++ ${PEANOWRAP2_FLAGS} -I. -I../../../../aie_runtime_lib/AIE2 -c $< -o ${@F}
+build/kernels.a: build/softmax.o build/lut_based_ops.o
+	ar rvs $@ $+
 else ifeq ($(device),npu2)
-	cd ${@D} && ${PEANO_INSTALL_DIR}/bin/clang++ ${PEANOWRAP2P_FLAGS} -I. -I../../../../aie_runtime_lib/AIE2P -c $< -o ${@F}
+build/kernels.a: build/softmax.o
+	ar rvs $@ $+
 else
 	echo "Device type not supported"
 endif
 
-build/kernels.a: build/softmax.o 
-	ar rvs $@ $+
-
 build/aie.mlir: ${srcdir}/${aie_py_src}
 	mkdir -p ${@D}
 	python3 $< ${device} > $@
diff --git a/programming_examples/ml/softmax/bf16_softmax.mlir b/programming_examples/ml/softmax/bf16_softmax.mlir
diff --git a/programming_examples/ml/softmax/softmax.cc b/programming_examples/ml/softmax/softmax.cc
diff --git a/programming_examples/ml/softmax/test.cpp b/programming_examples/ml/softmax/test.cpp
@@ -93,7 +93,7 @@ int main(int argc, const char *argv[]) {
 
   size_t OUT_SIZE = INOUT1_SIZE + trace_size;
 
-  srand(time(NULL));
+  srand(42);
 
   // Load instruction sequence
   std::vector<uint32_t> instr_v =