diff --git a/aie_kernels/relu.cc b/aie_kernels/relu.cc index 62a53c7479..a2e87cffc4 100644 --- a/aie_kernels/relu.cc +++ b/aie_kernels/relu.cc @@ -19,25 +19,21 @@ #include - -void relu(bfloat16 * restrict a, bfloat16 * restrict c, const int TILE_SIZE) { +void relu(bfloat16 *restrict a, bfloat16 *restrict c, const int TILE_SIZE) { const int v_factor = 32; v32bfloat16 zeroes = broadcast_zero_bfloat16(); event0(); for (size_t i = 0; i < TILE_SIZE; i += v_factor) - chess_prepare_for_pipelining - chess_loop_range(32, 32) - { - v32bfloat16 input = *(v32bfloat16 *)(a + i); - v32bfloat16 output = max(input, zeroes); - *(v32bfloat16 *)(c + i) = output; - } + chess_prepare_for_pipelining chess_loop_range(32, 32) { + v32bfloat16 input = *(v32bfloat16 *)(a + i); + v32bfloat16 output = max(input, zeroes); + *(v32bfloat16 *)(c + i) = output; + } event1(); return; } - extern "C" { void bf16_relu(bfloat16 *a_in, bfloat16 *c_out) { relu(a_in, c_out, 1024); } diff --git a/programming_examples/basic/relu/aie2.py b/programming_examples/basic/relu/aie2.py index f9553b57c3..8204706127 100644 --- a/programming_examples/basic/relu/aie2.py +++ b/programming_examples/basic/relu/aie2.py @@ -25,7 +25,6 @@ def my_relu(): enable_tracing = True trace_size = 65536 - # Tile sizes n = 1024 N_div_n = N // n @@ -50,9 +49,7 @@ def device_body(): # AIE Core Function declarations - bf16_relu = external_func( - "bf16_relu", inputs=[memRef_ty, memRef_ty] - ) + bf16_relu = external_func("bf16_relu", inputs=[memRef_ty, memRef_ty]) # Tile declarations ShimTile = tile(0, 0) @@ -83,12 +80,10 @@ def device_body(): outC = object_fifo("outC", MemTile, ShimTile, buffer_depth, memRef_C_MT_ty) object_fifo_link(outC_fifo_names[0:n_cores], outC) - # Set up a circuit-switched flow from core to shim for tracing information if enable_tracing: flow(cores[0], WireBundle.Trace, 0, ShimTile, WireBundle.DMA, 1) - # Set up compute tiles for i in range(n_cores): # Compute tile i @@ -200,7 +195,6 @@ def sequence(A, C): # Set start BD to our shim bd_Id (13) ipu_write32(column=0, row=0, address=0x1D20C, value=trace_bd_id) - ipu_dma_memcpy_nd( metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, C_sz_in_i32s] ) diff --git a/programming_examples/basic/relu/test.cpp b/programming_examples/basic/relu/test.cpp index 53622644bc..14bb24babe 100644 --- a/programming_examples/basic/relu/test.cpp +++ b/programming_examples/basic/relu/test.cpp @@ -55,11 +55,10 @@ void write_out_trace(char *traceOutPtr, size_t trace_size, std::string path) { } } - static inline std::bfloat16_t random_bfloat16_t(float scale, float bias) { // Random numbers should NOT be uniformly between 0 and 1, because that // would make the matrix product AB always close to 1. - return std::bfloat16_t((scale * (float)rand() / (float)(RAND_MAX))-bias); + return std::bfloat16_t((scale * (float)rand() / (float)(RAND_MAX)) - bias); } bool nearly_equal(std::bfloat16_t a, std::bfloat16_t b) { @@ -98,9 +97,9 @@ int main(int argc, const char *argv[]) { "trace_sz,t", po::value()->default_value(0), "the depth of the trace buffer")( "trace_file,f", po::value()->default_value("trace.txt"), - "the output trace path")( - "verbosity,v", po::value()->default_value(0), - "the verbosity of the output")( + "the output trace path")("verbosity,v", + po::value()->default_value(0), + "the verbosity of the output")( "instr,i", po::value()->required(), "path of file containing userspace instructions to be sent to the LX6"); po::variables_map vm; @@ -180,7 +179,6 @@ int main(int argc, const char *argv[]) { auto bo_out = xrt::bo(device, real_out_size, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); - if (verbosity >= 1) std::cout << "Writing data into buffer objects.\n"; @@ -254,8 +252,8 @@ int main(int argc, const char *argv[]) { npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max; if (trace_size > 0) { - write_out_trace(((char *)bufOut) + (OUT_SIZE*2), trace_size, - vm["trace_file"].as()); + write_out_trace(((char *)bufOut) + (OUT_SIZE * 2), trace_size, + vm["trace_file"].as()); } if (VERIFY) {