|
| 1 | +//===- softmax.cc --------------------------------------------*- C++ |
| 2 | +//-*-===// |
| 3 | +// |
| 4 | +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. |
| 5 | +// See https://llvm.org/LICENSE.txt for license information. |
| 6 | +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 7 | +// |
| 8 | +// Copyright (C) 2025, Advanced Micro Devices, Inc. |
| 9 | +// |
| 10 | +//===-------------------------------------------------- --------===// |
| 11 | + |
| 12 | +#include <aie_api/aie.hpp> |
| 13 | +#include <stdint.h> |
| 14 | + |
// Number of lanes per vector operation in this file's kernels. The trailing
// "32" is presumably an alternative width the author considered - confirm
// before changing, since the kernels must agree with this value.
#define SM_VEC_LEN 16 // 32
// Constant used as log2(e) when exp(x) is evaluated as 2^(x * log2(e)).
// 1.4453125 (= 185/128) is a coarser approximation of the full-precision
// value kept in the trailing comment; presumably chosen because it is exactly
// representable in bfloat16 - TODO confirm.
#define log2e 1.4453125 // 1.44269504089

// Brings the AIE API names (vector, accum, mul, ...) into file scope.
using namespace aie;
| 19 | + |
| 20 | +void softmax_simple_bf16(bfloat16 *restrict input_vector, |
| 21 | + bfloat16 *restrict output_vector, |
| 22 | + const int32_t vector_size) { |
| 23 | + event0(); |
| 24 | + |
| 25 | + int num_elems = vector_size; |
| 26 | + float accum_exp_val; |
| 27 | + auto it_exp_in = aie::cbegin_vector<16>((bfloat16 *)input_vector); |
| 28 | + auto it_exp_out = aie::begin_vector<16>((bfloat16 *)output_vector); |
| 29 | + auto it_scale = aie::cbegin_restrict_vector<16>((bfloat16 *)output_vector); |
| 30 | + auto it_soft_out = aie::begin_restrict_vector<16>((bfloat16 *)output_vector); |
| 31 | + |
| 32 | + bfloat16 col_sum_inv; |
| 33 | + aie::vector<bfloat16, 16> in_elems, va; |
| 34 | + aie::accum<accfloat, 16> out_vals; |
| 35 | + int col_iters = num_elems >> 4; |
| 36 | + accum_exp_val = 0; |
| 37 | + |
| 38 | + ///////////////////// |
| 39 | + //// Compute exp //// |
| 40 | + ///////////////////// |
| 41 | + aie::vector<bfloat16, SM_VEC_LEN> exp_val; |
| 42 | + aie::vector<float, SM_VEC_LEN> input_fp32; |
| 43 | + aie::vector<bfloat16, SM_VEC_LEN> log2e_vec = |
| 44 | + aie::broadcast<bfloat16, SM_VEC_LEN>(log2e); |
| 45 | + |
| 46 | + const int elem_iters = num_elems / SM_VEC_LEN; |
| 47 | + aie::vector<bfloat16, SM_VEC_LEN> input_bf16; |
| 48 | + aie::accum<accfloat, SM_VEC_LEN> exp_val_accum; |
| 49 | + exp_val_accum = aie::zeros<accfloat, SM_VEC_LEN>(); |
| 50 | + for (int i = 0; i < elem_iters; i++) { |
| 51 | + input_bf16 = *it_exp_in++; |
| 52 | + aie::accum<accfloat, 16> exp_in; |
| 53 | + exp_in = aie::mul(input_bf16, log2e_vec); |
| 54 | + exp_val = aie::exp2<bfloat16>(exp_in.to_vector<float>()); |
| 55 | + exp_val_accum = add(exp_val_accum, exp_val); |
| 56 | + *it_exp_out++ = exp_val; |
| 57 | + } |
| 58 | + aie::vector<float, SM_VEC_LEN> reduce = exp_val_accum.to_vector<float>(); |
| 59 | + accum_exp_val = aie::reduce_add(reduce); |
| 60 | + ///////////////////// |
| 61 | + |
| 62 | + col_sum_inv = (bfloat16)aie::inv(accum_exp_val); |
| 63 | + for (int c = 0; c < col_iters; c++) { |
| 64 | + in_elems = *it_scale++; |
| 65 | + out_vals = aie::mul(in_elems, col_sum_inv); |
| 66 | + *it_soft_out++ = out_vals.to_vector<bfloat16>(); |
| 67 | + } |
| 68 | + |
| 69 | + event1(); |
| 70 | + |
| 71 | + return; |
| 72 | +} |
| 73 | + |
| 74 | +extern "C" { |
| 75 | + |
| 76 | +void softmax_bf16(bfloat16 *restrict input, bfloat16 *restrict output, |
| 77 | + const int32_t input_size) { |
| 78 | + softmax_simple_bf16(input, output, input_size); |
| 79 | +} |
| 80 | + |
| 81 | +} // extern "C" |
0 commit comments