From df94633ed29f987ea0ab71d66db451660092a392 Mon Sep 17 00:00:00 2001 From: Lina Yu <108146828+linay-xsj@users.noreply.github.com> Date: Tue, 12 Sep 2023 12:45:49 -0700 Subject: [PATCH] Add softmax test cases (#635) --- aie_runtime_lib/AIE2/liblut_based_ops.a | Bin 23370 -> 24170 bytes .../AIEVecToCpp/TranslateAIEVecToCpp.cpp | 15 ++- .../bf16_softmax/bf16_softmax.mlir | 31 +++++ .../aievec_tests/bf16_softmax/defines.h | 3 + .../aievec_tests/bf16_softmax/dut.cc | 76 ++++++++++++ .../aievec_tests/bf16_softmax/testbench.cc | 62 ++++++++++ .../bf16_softmax_2/bf16_softmax.mlir | 40 +++++++ .../aievec_tests/bf16_softmax_2/defines.h | 3 + .../aievec_tests/bf16_softmax_2/dut.cc | 108 ++++++++++++++++++ .../aievec_tests/bf16_softmax_2/testbench.cc | 65 +++++++++++ 10 files changed, 401 insertions(+), 2 deletions(-) create mode 100644 test/unit_tests/aievec_tests/bf16_softmax/bf16_softmax.mlir create mode 100644 test/unit_tests/aievec_tests/bf16_softmax/defines.h create mode 100644 test/unit_tests/aievec_tests/bf16_softmax/dut.cc create mode 100644 test/unit_tests/aievec_tests/bf16_softmax/testbench.cc create mode 100644 test/unit_tests/aievec_tests/bf16_softmax_2/bf16_softmax.mlir create mode 100644 test/unit_tests/aievec_tests/bf16_softmax_2/defines.h create mode 100644 test/unit_tests/aievec_tests/bf16_softmax_2/dut.cc create mode 100644 test/unit_tests/aievec_tests/bf16_softmax_2/testbench.cc diff --git a/aie_runtime_lib/AIE2/liblut_based_ops.a b/aie_runtime_lib/AIE2/liblut_based_ops.a index 869616f86f56c15d63694865ad3f56572fbd17ea..7edede56cbec0730be248c06d206cde3264a6589 100644 GIT binary patch delta 3694 zcmds4{cl@W89w*M*LIR?C(XS{>^T02ojP{DY+u`noi44Nk9F)i2y);Pvhn|_A9i(y5 z$HnQQSkK4%>3*?}j}KsCOrOg1A=AP4tHO81FY)m{`hjTT`dJ)+SWoQ+6AylXJop_3 zANo-kHcU3e`+7sf6>FYL&K*k4F753Z*p^M~o}S+qrx#!zv^-((aeZt6vtLwaziG%~ zr(vTB?r_LxMjs4l*k}eX65vGwA#@uy9z_2Q=D)7`VL*Su{78Tw355Lvxf{kgopnRS z+N!PtxbxmIoi^3xM(Uod*6q^3f|D`&j(b=i^@kJw?vS6Z^SbE`PdQDrm}rv(Mx~T;wqg|MLizpf09pmT1!dWe=YolMr@UuU&?pgGBk>vqX|U`cGDNV zO?02fK(Bbag>a|8Gn|uK+qv4aIzpPrV|qfKg+ILb_w{tFt!c+?81oikJ5c`wSZ3Dp z<%9^lP7g`1;~OFQLF6_&gfceKlV=X+nNXhjL7s`x-?mwow_*vdd>m5w6oiUHaF}NR zXfy+r8OWsa2t9+SE;DJLgrwyJ{EWwtX%LQrxQfuC zJqbzB3umG43&>c7=RkY~p-uY}NIIUi_$4xS;dKz-Lg-l5UO`pQI^9Cb#Ww2>!e-V@ zTM2MI>n0;~vo=nI9@fT>(97y?LfFdc4H*SR3ndm%W?H4r5|XjOKUSV1!Y*@j^d zR+dQBC_AB&%+=FVijA|=a}eG1s$yQ>fua}pkuz3fZQ-z*j|!W)aB`EDDO(T%&E0$rP z-4BvRFC9ilx@ejlL4Q52w?kS+y{}kTcj7CkJY1v@>1iZKO2`vPPOg!(va6!)TSz{% zM&iOsCy^X0A)lw8gw1U)qx3+DltXfF3Hd6Gb;;H@K&ojA!D+{fZ9UOm1X9pQm+Ag4 zFZVY6PM2-(6`*t)%X?@K< zWF?Y@Hg%haL?e{liC(D&;qH7xc|Dv*V)H?C^a8Rw%&6mkl)ES2VMAiq8qwHHzuMgl zkIxUgP2;T~%66v@k`zKbRH%FgE7u>x%F+guhKnVs56ST&k}a2uNL#5j(!f1JBN2P| zB-(ZrZ0woDX9;Y(QA!r1!ovFr0Yg%>uFK(pN;gP9&6+lPW*b|aIu<=A0qjEwm~X!2Z_}+ zLgMIiJ#ve-3TRsBs(KAbev^6G7nCH^YdvPJfqv9uTXv$@Uznj9yhsidk?JY8_)!^T zb)ldog2VH9qZGs8=kvq-PT!C0_lnrNOTVpZ6G#(95^U@!PI#rjG9XPMnJbX$jZWwO zu;D7VJk|lPcuHqL=@{KS+MxvfU5espcNAz>FqxcD!lCf)8JDWIJ7&_;^LrczlbLL4 zVcrp7FJlgm?|?76XF8i&^raSOeVOFqLN>LukVz%8zI6KFTmo1}a9RmRm2eW?Dx=UU z6sT8br)T^S!pYD~WF{P#PIfCwGLn06W1Ocev0%-C`D|*>d~(*2UVx?;a__{h^H!2w z$}BCU)pQ_DihQ|i{iptqKig3**S_uV8qP*7Bd6(GBc{(-CHm<|OAUD(zG;hBX5;qv zH2jeY_#x0rN5=2tzgA6G#+};N;ip2ij{bDq$=@{5YvXQVmzR)Ps+(}}7dvSCgquII zk&Z&L#9(e+%>*3cyYCVBd=QnUuTE48mKeORf=Q}f0)f3+qXRBijF1%l7-}9Hrj3(s u;fYZ~oOJV~i$6I=)01vYtB@8a={YsMt0r-$+q4M35%C?L$5MAM9RC}1tdxWR delta 3367 zcmd^?{ZCuh8OP7P_+A^h#)NCY#$X#eaR>(c+V~Z0k~XOVc_C{Gf$$m(9-*PYfYPjM z>$FW%sY|4gCzD2}ZP^rQT3SV^oOUCNwzk?PE$e<+*J)~|RqNigtlCmdQzI=^_dLhg zsp<9?>`M1M-}CuC=Q-E+oO^YB>$-5`HQ}sH-xk*5;ZS(R^*ooMSa{`%MF_0?%%(CX zuzSZqKk8RL+FK6KC_cEO)WUXSEzBC9hV}{zoW%3hia{7OS>Pg`@0f-mQ)z{_@cg85 z6kgh7g*%(9S#MRJKKruuRHc4Y$C$_-f_U=*Lo65?3k0+PoNe9>JG{!Lfd^JwYGIq# z{b}fjXT43u;C-)F47yvY|68s8`k*g3Ha)xK4{AY@-95cyICp3)h6gCEjY1H7tvjWl zPxA$`%dIU!^)K{{HL_C%#@6tpLo}4a_k4~0=2Fx)=^;dQC`+ue3xg!;n=lykVgiGP zrm)*7mGFUGd2&2Ydh_JFd2$5&{)WuMR6-~{j-m7kY|0+P^>`MAPPeQ497b0DHEab| z_7oZy$&lj8RWg({GL$!Y`*Unda!K!?)JgX#f56C;zjFKU*p@2)#@3)1%CM+XgKe4O z*@8h){MZ_m7`CF)g>4Bl_G4fb^6~vR?FfA6Z!k|Fk#zDD2EDFy2}9-atV_$5WS&56 zbW3!Fr%3vwA&IdwC@U;g(iG}d)Fl@A0tQ)-=r_Jhq)s}C=2en5`8o#4Aiaw7-XPK- zeGAQBlC;Y=G3Z4;#VsNZ>4#|klcX~v-zC?;XHv@%x%px_Np9w|`ABZzvn5D+_%wS+ zHt}f&NN(fxr${#Q`pYD}yux!NeO#`P^mF+Ql2aNl{1WK^FY$enZCw6@WQfbRNrrjh z-;<2+68|6>=LvU6wsR^fD3hF)lk8Nh6=)A@V!Z-0;RbQD0$&c>1ut9*+XM}M8n)YF zm}hb+xQ*#BTTz(p)Kj;^Cz!h%>=C=r1IdV0=!5=%)BSBqN6jY7vQy+!}c9YyT7P{nVt!zUWVH-^I(?zgGD}5uMr$9g6qHFN961) za+vyKf_)p{4H%7Ehkr#@*9Pk*!J`}C`v|K%{L=91)>rH=1ic~Lt90a``~gBqr~V7x zj&Bq0!pCuYMpr>E3R?|^s!+a0y^XqIK9b%SOl?Gir~B`KvH*p|!UAv31e}4re#{^oD1-m2mbDIC2lv z_rk6A&4L3=9hQMXG|djz7zTBcbWfr3D^$7mG*woQk#(ThQWppg6hVFu-AsKPG97io z)9_-4LwEtMb=1|LqvUiUx$snB@+#c!a2@zICRA5!rU&X0O7%Jo7%B{)tKrqnw9Ro> zkp20BU8uKizXjt7clg(obnjjj9@h1w_sIT2p$7lK>YrdOVVD0#)-l+TG;1G{oGoPW znR2Ngk@JIslY@7mF=-VA6C{!r(O`l@Nw=Q1gj!*O6UnVPU2THP$%c&0gaytNHXtuh z-$?KxU#}c!B5?j5Py+oNv=I*s_k8tAmw93;N8 zFgllf7*2OuGe^jJV0{R)9i{H;`Al`5x}*7Se*Kq;ezSeNWoEl%8p5Jk7VSM7~(!^pwYoDH<^e?36=N6}z<`$;Y zi~gCJ zaH-4s|N83}WWfjf`>y2f>BoJR|8Y-Y>!Gc6{FZ(ZU+$m1k4FyuStoXs!5g*~xKO2o z<)N6^XM`(5F8Kv~rfVPYX>Z0SWDpenySP2 zbkXZSf8;Q{J6tKPCGaa1lho=ZbomGMde$50tU~RGTO8a6i4l*~^&n$Gm_qRQK{!3) hp>P#LPe1%N7j&b%*D~s{^x&Tx`mz`Eb7G|ae*pdBTV4PF diff --git a/lib/Targets/AIEVecToCpp/TranslateAIEVecToCpp.cpp b/lib/Targets/AIEVecToCpp/TranslateAIEVecToCpp.cpp index a56f6feb4a..b3520e4c9c 100644 --- a/lib/Targets/AIEVecToCpp/TranslateAIEVecToCpp.cpp +++ b/lib/Targets/AIEVecToCpp/TranslateAIEVecToCpp.cpp @@ -27,8 +27,10 @@ #include "mlir/Support/IndentedOstream.h" #include "mlir/Support/MathExtras.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" #include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -213,6 +215,8 @@ struct CppEmitter { /// names of values in a scope. std::stack valueInScopeCount; std::stack labelInScopeCount; + + llvm::SmallSet includeNames; }; } // namespace @@ -2867,9 +2871,16 @@ LogicalResult CppEmitter::emitOperation(Operation &op, bool trailingSemicolon) { LogicalResult status = llvm::TypeSwitch(&op) // EmitC ops. - .Case( + .Case( [&](auto op) { return printOperation(*this, op); }) + .Case([&](auto op) { + StringRef name = op.getInclude(); + if (!includeNames.count(name)) { + includeNames.insert(name); + return printOperation(*this, op); + } + return success(); + }) // SCF ops. .Case( [&](auto op) { return printOperation(*this, op); }) diff --git a/test/unit_tests/aievec_tests/bf16_softmax/bf16_softmax.mlir b/test/unit_tests/aievec_tests/bf16_softmax/bf16_softmax.mlir new file mode 100644 index 0000000000..d4b5a38920 --- /dev/null +++ b/test/unit_tests/aievec_tests/bf16_softmax/bf16_softmax.mlir @@ -0,0 +1,31 @@ +// RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=16 test-fastest-varying=0 vectorize-reductions=true" --convert-vector-to-aievec="aie-target=aieml" -lower-affine | aie-translate -aieml=true --aievec-to-cpp -o dut.cc +// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I%aie_runtime_lib%/AIE2 -L%aie_runtime_lib%/AIE2 -llut_based_ops -I %aietools/include -D__AIEARCH__=20 -D__AIENGINE__ -I. %S/testbench.cc dut.cc +// RUN: mkdir -p data +// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout +// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s +// CHECK: TEST PASSED +module { + func.func @dut(%arg0: memref<1024xbf16>, %arg1: memref<1024xbf16>) { + %cst = arith.constant 1.000000e+00 : f32 + %cst_0 = arith.constant 0.000000e+00 : f32 + affine.for %arg2 = 0 to 1024 { + %3 = affine.load %arg0[%arg2] : memref<1024xbf16> + %4 = math.exp %3 : bf16 + affine.store %4, %arg0[%arg2] : memref<1024xbf16> + } + %0 = affine.for %arg2 = 0 to 1024 iter_args(%arg3 = %cst_0) -> (f32) { + %3 = affine.load %arg0[%arg2] : memref<1024xbf16> + %4 = arith.extf %3 : bf16 to f32 + %5 = arith.addf %arg3, %4 : f32 + affine.yield %5 : f32 + } + %1 = arith.divf %cst, %0 : f32 + %2 = arith.truncf %1 : f32 to bf16 + affine.for %arg2 = 0 to 1024 { + %3 = affine.load %arg0[%arg2] : memref<1024xbf16> + %4 = arith.mulf %3, %2 : bf16 + affine.store %4, %arg1[%arg2] : memref<1024xbf16> + } + return + } +} diff --git a/test/unit_tests/aievec_tests/bf16_softmax/defines.h b/test/unit_tests/aievec_tests/bf16_softmax/defines.h new file mode 100644 index 0000000000..3c6fc96a69 --- /dev/null +++ b/test/unit_tests/aievec_tests/bf16_softmax/defines.h @@ -0,0 +1,3 @@ +#pragma once +constexpr unsigned const IN0_SIZE = 1024; +constexpr unsigned const OUT0_SIZE = 1024; diff --git a/test/unit_tests/aievec_tests/bf16_softmax/dut.cc b/test/unit_tests/aievec_tests/bf16_softmax/dut.cc new file mode 100644 index 0000000000..8e32ea3461 --- /dev/null +++ b/test/unit_tests/aievec_tests/bf16_softmax/dut.cc @@ -0,0 +1,76 @@ +// Cycle count: 3245 +#include "lut_based_ops.h" + +void dut(bfloat16 *restrict v1, bfloat16 *restrict v2) { + int32_t v3 = 0; + int32_t v4 = 4; + int32_t v5 = 8; + int32_t v6 = 16; + int32_t v7 = 32; + v16float v8 = broadcast_zero_float(); + bfloat16 v9 = 0.0e+00; + size_t v10 = 0; + size_t v11 = 1024; + size_t v12 = 16; + for (size_t v13 = v10; v13 < v11; v13 += v12) + chess_prepare_for_pipelining chess_loop_range(64, 64) { + v16bfloat16 v14 = *(v16bfloat16 *)(v1 + v13); + v16accfloat v15 = getExpBf16(v14); + v16bfloat16 v16 = to_v16bfloat16(v15); + *(v16bfloat16 *)(v1 + v13) = v16; + } + size_t v17 = 0; + size_t v18 = 1024; + size_t v19 = 16; + v16float v20; + v16float v21 = v8; + for (size_t v22 = v17; v22 < v18; v22 += v19) + chess_prepare_for_pipelining chess_loop_range(64, 64) { + v16bfloat16 v23 = *(v16bfloat16 *)(v1 + v22); + v16accfloat v24 = ups_to_v16accfloat(v23); + v16accfloat v25 = v16accfloat(v21); + v16accfloat v26 = add(v24, v25); + v16float v27 = v16float(v26); + v21 = v27; + } + v20 = v21; + v16float v28 = shift_bytes(v20, v20, v7); + v16accfloat v29 = v16accfloat(v20); + v16accfloat v30 = v16accfloat(v28); + v16accfloat v31 = add(v29, v30); + v16float v32 = v16float(v31); + v16float v33 = shift_bytes(v32, v32, v6); + v16accfloat v34 = v16accfloat(v32); + v16accfloat v35 = v16accfloat(v33); + v16accfloat v36 = add(v34, v35); + v16float v37 = v16float(v36); + v16float v38 = shift_bytes(v37, v37, v5); + v16accfloat v39 = v16accfloat(v37); + v16accfloat v40 = v16accfloat(v38); + v16accfloat v41 = add(v39, v40); + v16float v42 = v16float(v41); + v16float v43 = shift_bytes(v42, v42, v4); + v16accfloat v44 = v16accfloat(v42); + v16accfloat v45 = v16accfloat(v43); + v16accfloat v46 = add(v44, v45); + v16float v47 = v16float(v46); + float v48 = extract_elem(v47, v3); + bfloat16 v49 = getInvBf16(v48); + v32bfloat16 v50 = broadcast_to_v32bfloat16(v49); + v16bfloat16 v51 = extract_v16bfloat16(v50, 0); + v32bfloat16 v52 = broadcast_to_v32bfloat16(v9); + v16bfloat16 v53 = extract_v16bfloat16(v52, 0); + v32bfloat16 v54 = concat(v51, v53); + size_t v55 = 0; + size_t v56 = 1024; + size_t v57 = 16; + for (size_t v58 = v55; v58 < v56; v58 += v57) + chess_prepare_for_pipelining chess_loop_range(64, 64) { + v16bfloat16 v59 = *(v16bfloat16 *)(v1 + v58); + v32bfloat16 v60 = concat(v59, v53); + v16accfloat v61 = mul_elem_16_2(v54, v60); + v16bfloat16 v62 = to_v16bfloat16(v61); + *(v16bfloat16 *)(v2 + v58) = v62; + } + return; +} diff --git a/test/unit_tests/aievec_tests/bf16_softmax/testbench.cc b/test/unit_tests/aievec_tests/bf16_softmax/testbench.cc new file mode 100644 index 0000000000..cd82b0c14c --- /dev/null +++ b/test/unit_tests/aievec_tests/bf16_softmax/testbench.cc @@ -0,0 +1,62 @@ +#include "../common/testbench.h" +#include "defines.h" +#include +#include +#include +#include + +void dut(bfloat16 *restrict in0, bfloat16 *restrict out0); +void dut_ref(bfloat16 *in0, bfloat16 *out0); + +alignas(32) bfloat16 g_in0[IN0_SIZE]; +alignas(32) bfloat16 g_out0[OUT0_SIZE]; +alignas(32) bfloat16 g_out0Ref[OUT0_SIZE]; + +int main(int argc, char *argv[]) { + std::string dataDir(TO_STR(DATA_DIR)); + srand(10); + std::generate(g_in0, g_in0 + IN0_SIZE, + [&]() { return random_bfloat16(-2, 0, 2); }); + + writeData(g_in0, IN0_SIZE, dataDir + "/in0.txt"); + + chess_memory_fence(); + auto cyclesBegin = chess_cycle_count(); + dut(g_in0, g_out0); + auto cyclesEnd = chess_cycle_count(); + chess_memory_fence(); + + auto cycleCount = (int)(cyclesEnd - cyclesBegin); + reportCycleCount(cycleCount, dataDir + "/cycle_count.txt"); + + writeData(g_out0, OUT0_SIZE, dataDir + "/out0.txt"); + + dut_ref(g_in0, g_out0Ref); + writeData(g_out0Ref, OUT0_SIZE, dataDir + "/out0_ref.txt"); + + bool ok = true; + ok &= checkData(g_out0, g_out0Ref, OUT0_SIZE, 0, 1e-2, 1e-2); + + if (ok) + printf("TEST PASSED\n"); + else + printf("TEST FAILED\n"); + + return ok ? 0 : 1; +} + +void dut_ref(bfloat16 *in0, bfloat16 *out0) { + float sum = 0.0f; + + for (unsigned k = 0; k < IN0_SIZE; ++k) { + float in = in0[k]; + float out = exp(in); + in0[k] = (bfloat16)out; + sum += in0[k]; + } + + bfloat16 sum_inv = (bfloat16)(1.0f / sum); + for (unsigned k = 0; k < IN0_SIZE; ++k) { + out0[k] = in0[k] * sum_inv; + } +} diff --git a/test/unit_tests/aievec_tests/bf16_softmax_2/bf16_softmax.mlir b/test/unit_tests/aievec_tests/bf16_softmax_2/bf16_softmax.mlir new file mode 100644 index 0000000000..465fe17b09 --- /dev/null +++ b/test/unit_tests/aievec_tests/bf16_softmax_2/bf16_softmax.mlir @@ -0,0 +1,40 @@ +// RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=16 test-fastest-varying=0 vectorize-reductions=true" --convert-vector-to-aievec="aie-target=aieml" -lower-affine | aie-translate -aieml=true --aievec-to-cpp -o dut.cc +// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I%aie_runtime_lib%/AIE2 -L%aie_runtime_lib%/AIE2 -llut_based_ops -I %aietools/include -D__AIEARCH__=20 -D__AIENGINE__ -I. %S/testbench.cc dut.cc +// RUN: mkdir -p data +// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout +// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s +// CHECK: TEST PASSED +module { + func.func @dut(%arg0: memref<1024xbf16>, %arg1: memref<1024xbf16>) { + %cst = arith.constant 0.000000e+00 : f32 + %cst_0 = arith.constant 1.000000e+00 : f32 + %cst_1 = arith.constant 0.000000e+00 : bf16 + %cst_2 = arith.constant dense<0xFF80> : vector<32xbf16> + %0 = affine.for %arg2 = 0 to 1024 step 32 iter_args(%arg3 = %cst_2) -> (vector<32xbf16>) { + %5 = vector.transfer_read %arg0[%arg2], %cst_1 : memref<1024xbf16>, vector<32xbf16> + %6 = arith.maxf %arg3, %5 : vector<32xbf16> + affine.yield %6 : vector<32xbf16> + } + %1 = vector.reduction , %0 : vector<32xbf16> into bf16 + affine.for %arg2 = 0 to 1024 { + %5 = affine.load %arg0[%arg2] : memref<1024xbf16> + %6 = arith.subf %5, %1 : bf16 + %7 = math.exp %6 : bf16 + affine.store %7, %arg0[%arg2] : memref<1024xbf16> + } + %2 = affine.for %arg2 = 0 to 1024 iter_args(%arg3 = %cst) -> (f32) { + %5 = affine.load %arg0[%arg2] : memref<1024xbf16> + %6 = arith.extf %5 : bf16 to f32 + %7 = arith.addf %arg3, %6 : f32 + affine.yield %7 : f32 + } + %3 = arith.divf %cst_0, %2 : f32 + %4 = arith.truncf %3 : f32 to bf16 + affine.for %arg2 = 0 to 1024 { + %5 = affine.load %arg0[%arg2] : memref<1024xbf16> + %6 = arith.mulf %5, %4 : bf16 + affine.store %6, %arg1[%arg2] : memref<1024xbf16> + } + return + } +} diff --git a/test/unit_tests/aievec_tests/bf16_softmax_2/defines.h b/test/unit_tests/aievec_tests/bf16_softmax_2/defines.h new file mode 100644 index 0000000000..3c6fc96a69 --- /dev/null +++ b/test/unit_tests/aievec_tests/bf16_softmax_2/defines.h @@ -0,0 +1,3 @@ +#pragma once +constexpr unsigned const IN0_SIZE = 1024; +constexpr unsigned const OUT0_SIZE = 1024; diff --git a/test/unit_tests/aievec_tests/bf16_softmax_2/dut.cc b/test/unit_tests/aievec_tests/bf16_softmax_2/dut.cc new file mode 100644 index 0000000000..57e2b5b9cf --- /dev/null +++ b/test/unit_tests/aievec_tests/bf16_softmax_2/dut.cc @@ -0,0 +1,108 @@ +// Cycle count: 3712 +#include "lut_based_ops.h" + +void dut(bfloat16 *restrict v1, bfloat16 *restrict v2) { + int32_t v3 = 0; + int32_t v4 = 2; + int32_t v5 = 4; + int32_t v6 = 8; + int32_t v7 = 16; + int32_t v8 = 32; + v16float v9 = broadcast_zero_float(); + bfloat16 v10 = 0.0e+00; + v32bfloat16 v11 = broadcast_to_v32bfloat16( + (bfloat16)-338953138925153547590470800371487866880.000000); + size_t v12 = 0; + size_t v13 = 1024; + size_t v14 = 32; + v32bfloat16 v15; + v32bfloat16 v16 = v11; + for (size_t v17 = v12; v17 < v13; v17 += v14) + chess_prepare_for_pipelining chess_loop_range(32, 32) { + v32bfloat16 v18 = *(v32bfloat16 *)(v1 + v17); + v32bfloat16 v19 = max(v16, v18); + v16 = v19; + } + v15 = v16; + v32bfloat16 v20 = shift_bytes(v15, v15, v8); + v32bfloat16 v21 = max(v15, v20); + v32bfloat16 v22 = shift_bytes(v21, v21, v7); + v32bfloat16 v23 = max(v21, v22); + v32bfloat16 v24 = shift_bytes(v23, v23, v6); + v32bfloat16 v25 = max(v23, v24); + v32bfloat16 v26 = shift_bytes(v25, v25, v5); + v32bfloat16 v27 = max(v25, v26); + v32bfloat16 v28 = shift_bytes(v27, v27, v4); + v32bfloat16 v29 = max(v27, v28); + bfloat16 v30 = extract_elem(v29, v3); + v32bfloat16 v31 = broadcast_to_v32bfloat16(v30); + v16bfloat16 v32 = extract_v16bfloat16(v31, 0); + v16accfloat v33 = ups_to_v16accfloat(v32); + size_t v34 = 0; + size_t v35 = 1024; + size_t v36 = 16; + for (size_t v37 = v34; v37 < v35; v37 += v36) + chess_prepare_for_pipelining chess_loop_range(64, 64) { + v16bfloat16 v38 = *(v16bfloat16 *)(v1 + v37); + v16accfloat v39 = ups_to_v16accfloat(v38); + v16accfloat v40 = sub(v39, v33); + v16bfloat16 v41 = to_v16bfloat16(v40); + v16accfloat v42 = getExpBf16(v41); + v16bfloat16 v43 = to_v16bfloat16(v42); + *(v16bfloat16 *)(v1 + v37) = v43; + } + size_t v44 = 0; + size_t v45 = 1024; + size_t v46 = 16; + v16float v47; + v16float v48 = v9; + for (size_t v49 = v44; v49 < v45; v49 += v46) + chess_prepare_for_pipelining chess_loop_range(64, 64) { + v16bfloat16 v50 = *(v16bfloat16 *)(v1 + v49); + v16accfloat v51 = ups_to_v16accfloat(v50); + v16accfloat v52 = v16accfloat(v48); + v16accfloat v53 = add(v51, v52); + v16float v54 = v16float(v53); + v48 = v54; + } + v47 = v48; + v16float v55 = shift_bytes(v47, v47, v8); + v16accfloat v56 = v16accfloat(v47); + v16accfloat v57 = v16accfloat(v55); + v16accfloat v58 = add(v56, v57); + v16float v59 = v16float(v58); + v16float v60 = shift_bytes(v59, v59, v7); + v16accfloat v61 = v16accfloat(v59); + v16accfloat v62 = v16accfloat(v60); + v16accfloat v63 = add(v61, v62); + v16float v64 = v16float(v63); + v16float v65 = shift_bytes(v64, v64, v6); + v16accfloat v66 = v16accfloat(v64); + v16accfloat v67 = v16accfloat(v65); + v16accfloat v68 = add(v66, v67); + v16float v69 = v16float(v68); + v16float v70 = shift_bytes(v69, v69, v5); + v16accfloat v71 = v16accfloat(v69); + v16accfloat v72 = v16accfloat(v70); + v16accfloat v73 = add(v71, v72); + v16float v74 = v16float(v73); + float v75 = extract_elem(v74, v3); + bfloat16 v76 = getInvBf16(v75); + v32bfloat16 v77 = broadcast_to_v32bfloat16(v76); + v16bfloat16 v78 = extract_v16bfloat16(v77, 0); + v32bfloat16 v79 = broadcast_to_v32bfloat16(v10); + v16bfloat16 v80 = extract_v16bfloat16(v79, 0); + v32bfloat16 v81 = concat(v78, v80); + size_t v82 = 0; + size_t v83 = 1024; + size_t v84 = 16; + for (size_t v85 = v82; v85 < v83; v85 += v84) + chess_prepare_for_pipelining chess_loop_range(64, 64) { + v16bfloat16 v86 = *(v16bfloat16 *)(v1 + v85); + v32bfloat16 v87 = concat(v86, v80); + v16accfloat v88 = mul_elem_16_2(v81, v87); + v16bfloat16 v89 = to_v16bfloat16(v88); + *(v16bfloat16 *)(v2 + v85) = v89; + } + return; +} diff --git a/test/unit_tests/aievec_tests/bf16_softmax_2/testbench.cc b/test/unit_tests/aievec_tests/bf16_softmax_2/testbench.cc new file mode 100644 index 0000000000..00a115721f --- /dev/null +++ b/test/unit_tests/aievec_tests/bf16_softmax_2/testbench.cc @@ -0,0 +1,65 @@ +#include "../common/testbench.h" +#include "defines.h" +#include +#include +#include +#include + +void dut(bfloat16 *restrict in0, bfloat16 *restrict out0); +void dut_ref(bfloat16 *in0, bfloat16 *out0); + +alignas(32) bfloat16 g_in0[IN0_SIZE]; +alignas(32) bfloat16 g_out0[OUT0_SIZE]; +alignas(32) bfloat16 g_out0Ref[OUT0_SIZE]; + +int main(int argc, char *argv[]) { + std::string dataDir(TO_STR(DATA_DIR)); + srand(10); + std::generate(g_in0, g_in0 + IN0_SIZE, + [&]() { return random_bfloat16(-3, 1, 2); }); + + writeData(g_in0, IN0_SIZE, dataDir + "/in0.txt"); + + chess_memory_fence(); + auto cyclesBegin = chess_cycle_count(); + dut(g_in0, g_out0); + auto cyclesEnd = chess_cycle_count(); + chess_memory_fence(); + + auto cycleCount = (int)(cyclesEnd - cyclesBegin); + reportCycleCount(cycleCount, dataDir + "/cycle_count.txt"); + + writeData(g_out0, OUT0_SIZE, dataDir + "/out0.txt"); + + dut_ref(g_in0, g_out0Ref); + writeData(g_out0Ref, OUT0_SIZE, dataDir + "/out0_ref.txt"); + + bool ok = true; + ok &= checkData(g_out0, g_out0Ref, OUT0_SIZE, 0, 1e-2, 1e-2); + + if (ok) + printf("TEST PASSED\n"); + else + printf("TEST FAILED\n"); + + return ok ? 0 : 1; +} + +void dut_ref(bfloat16 *in0, bfloat16 *out0) { + float sum = 0.0f; + bfloat16 maxx = bfloat16(-0x1.FEp+127f); + for (unsigned k = 0; k < IN0_SIZE; ++k) { + maxx = std::max(maxx, in0[k]); + } + for (unsigned k = 0; k < IN0_SIZE; ++k) { + float in = in0[k] - maxx; + float out = exp(in); + in0[k] = (bfloat16)out; + sum += in0[k]; + } + + bfloat16 sum_inv = (bfloat16)(1.0f / sum); + for (unsigned k = 0; k < IN0_SIZE; ++k) { + out0[k] = in0[k] * sum_inv; + } +}