From b0a269748b4d731a9527bc414eb1c74e93362728 Mon Sep 17 00:00:00 2001
From: Philip James-Roxby
Date: Wed, 10 Apr 2024 12:52:26 -0600
Subject: [PATCH] ReLU with tracing (#1204)

ReLU example with tracing

Co-authored-by: pjr
Co-authored-by: Joseph Melber
---
 aie_kernels/relu.cc                           |  41 +++
 .../basic/relu/CMakeLists.txt                 |  68 ++++
 programming_examples/basic/relu/Makefile      |  54 +++
 programming_examples/basic/relu/aie2.py       | 209 +++++++++++
 programming_examples/basic/relu/test.cpp      | 326 ++++++++++++++++++
 5 files changed, 698 insertions(+)
 create mode 100644 aie_kernels/relu.cc
 create mode 100644 programming_examples/basic/relu/CMakeLists.txt
 create mode 100644 programming_examples/basic/relu/Makefile
 create mode 100644 programming_examples/basic/relu/aie2.py
 create mode 100644 programming_examples/basic/relu/test.cpp

diff --git a/aie_kernels/relu.cc b/aie_kernels/relu.cc
new file mode 100644
index 0000000000..a2e87cffc4
--- /dev/null
+++ b/aie_kernels/relu.cc
@@ -0,0 +1,41 @@
+//===- relu.cc --------------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#define __AIENGINE__ 2
+#define NOCPP
+#define __AIEARCH__ 20
+
+// NOTE(review): the header names on the five #include lines below were lost in
+// the copy of this patch under review (everything between '<' and '>' was
+// stripped). They are reconstructed here -- confirm against the original
+// commit before applying.
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <type_traits>
+
+#include <aie_api/aie.hpp>
+
+// Element-wise ReLU over one tile of bfloat16 data: c[i] = max(a[i], 0).
+//
+//   a         - input tile, read 32 elements at a time
+//   c         - output tile, written 32 elements at a time
+//   TILE_SIZE - number of elements in each tile
+//
+// The loop steps by v_factor (32) and carries chess_loop_range(32, 32), so the
+// only TILE_SIZE this function is written for is 32 * 32 = 1024. Any other
+// value makes the loop-range annotation disagree with the real trip count.
+// event0()/event1() bracket the loop so it can be located in a hardware trace.
+void relu(bfloat16 *restrict a, bfloat16 *restrict c, const int TILE_SIZE) {
+  const int v_factor = 32;
+  v32bfloat16 zeroes = broadcast_zero_bfloat16();
+
+  event0();
+  // Signed index: TILE_SIZE is an int, so avoid a signed/unsigned comparison.
+  for (int i = 0; i < TILE_SIZE; i += v_factor)
+    chess_prepare_for_pipelining chess_loop_range(32, 32) {
+      v32bfloat16 input = *(v32bfloat16 *)(a + i);
+      v32bfloat16 output = max(input, zeroes);
+      *(v32bfloat16 *)(c + i) = output;
+    }
+  event1();
+}
+
+extern "C" {
+
+// C-linkage entry point called from the AIE design (aie2.py declares it as
+// external_func "bf16_relu" taking two 1024-element bf16 memrefs). The tile
+// size of 1024 must stay in sync with `n` in aie2.py.
+void bf16_relu(bfloat16 *a_in, bfloat16 *c_out) { relu(a_in, c_out, 1024); }
+
+} // extern "C"
diff --git a/programming_examples/basic/relu/CMakeLists.txt b/programming_examples/basic/relu/CMakeLists.txt
new file mode 100644
index 0000000000..d9f511062f
--- /dev/null
+++ b/programming_examples/basic/relu/CMakeLists.txt
@@ -0,0 +1,68 @@
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2023 Advanced Micro Devices, Inc.
+
+# parameters
+# -DBOOST_ROOT: Path to Boost install
+# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo
+# -DXRT_LIB_DIR: Path to xrt_coreutil.lib
+# -DTARGET_NAME: Target name to be built
+
+# target_link_directories() below was introduced in CMake 3.13, so that is the
+# real minimum version this file can be configured with.
+cmake_minimum_required(VERSION 3.13)
+
+# powershell.exe being on the PATH is used as the "running under WSL" signal.
+find_program(WSL NAMES powershell.exe)
+
+if (NOT WSL)
+    set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
+    set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
+    set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
+else()
+    set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
+    set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
+    set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+endif()
+
+set(TARGET_NAME test CACHE STRING "Target to be built")
+
+SET (ProjectName ${TARGET_NAME})
+SET (currentTarget ${TARGET_NAME})
+
+if ( WSL )
+    set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR})
+endif ()
+
+project(${ProjectName})
+
+# Find packages
+find_package(Boost REQUIRED)
+
+add_executable(${currentTarget}
+    test.cpp
+)
+
+target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1)
+
+target_include_directories (${currentTarget} PUBLIC
+    ${XRT_INC_DIR}
+    ${Boost_INCLUDE_DIRS}
+)
+
+target_link_directories(${currentTarget} PUBLIC
+    ${XRT_LIB_DIR}
+    ${Boost_LIBRARY_DIRS}
+)
+
+# The Boost libraries are only named explicitly on the non-WSL path.
+if (NOT WSL)
+    target_link_libraries(${currentTarget} PUBLIC
+        xrt_coreutil
+        boost_program_options
+        boost_filesystem
+    )
+else()
+    target_link_libraries(${currentTarget} PUBLIC
+        xrt_coreutil
+    )
+endif()
diff --git a/programming_examples/basic/relu/Makefile b/programming_examples/basic/relu/Makefile
new file mode 100644
index 0000000000..f862aad403
--- /dev/null
+++ b/programming_examples/basic/relu/Makefile
@@ -0,0 +1,54 @@
+##===- Makefile -----------------------------------------------------------===##
+#
+# This file licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+##===----------------------------------------------------------------------===##
+
+include ../makefile-common
+
+targetname = testRelu
+
+# None of these targets name a file that the recipe creates.
+.PHONY: all run run_g trace clean_trace clean
+
+# Delete a half-written target when its recipe fails, e.g. an empty
+# build/aie.mlir left behind by a failing "python3 aie2.py > build/aie.mlir".
+.DELETE_ON_ERROR:
+
+all: build/final.xclbin build/insts.txt
+
+# AIE core object for the ReLU kernel.
+build/bf16_relu.o: ${REPO_ROOT}/aie_kernels/relu.cc
+	mkdir -p ${@D}
+	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -I${REPO_ROOT}/my_install/mlir_aie/aie_runtime_lib/AIE2 -c $< -o ${@F}
+
+# MLIR description of the design, printed by aie2.py on stdout.
+build/aie.mlir: aie2.py
+	mkdir -p ${@D}
+	python3 $< > $@
+
+build/final.xclbin: build/aie.mlir build/bf16_relu.o
+	mkdir -p ${@D}
+	cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
+				--aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%)
+
+# insts.txt is written by the same aiecc.py invocation that produces the
+# xclbin (--ipu-insts-name above). Declaring the dependency lets targets that
+# list build/insts.txt work from a clean tree and under "make -j".
+build/insts.txt: build/final.xclbin
+
+# Host executable, configured and built with CMake in _build/.
+${targetname}.exe: test.cpp
+	rm -rf _build
+	mkdir -p _build
+	cd _build && ${powershell} cmake -E env CXXFLAGS="-std=c++23" cmake .. -D CMAKE_C_COMPILER=gcc-13 -D CMAKE_CXX_COMPILER=g++-13 -DTARGET_NAME=${targetname}
+	cd _build && ${powershell} cmake --build . --config Release
+ifeq "${powershell}" "powershell.exe"
+	cp _build/${targetname}.exe $@
+else
+	cp _build/${targetname} $@
+endif
+
+run: ${targetname}.exe build/final.xclbin build/insts.txt
+	${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE
+
+# Same as "run", but asks the host program to reserve and dump a 64 KiB trace.
+run_g: ${targetname}.exe build/final.xclbin build/insts.txt
+	${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE --trace_sz 65536
+
+
+trace:
+	../../utils/parse_eventIR.py --filename trace.txt --mlir build/aie.mlir --colshift 1 > parse_eventIR_vs.json
+
+# Also removes the JSON produced by the "trace" target.
+clean_trace:
+	rm -rf tmpTrace trace.txt parse_eventIR_vs.json
+
+clean: clean_trace
+	rm -rf build _build ${targetname}.exe
+
diff --git a/programming_examples/basic/relu/aie2.py b/programming_examples/basic/relu/aie2.py
new file mode 100644
index 0000000000..8204706127
--- /dev/null
+++ b/programming_examples/basic/relu/aie2.py
@@ -0,0 +1,209 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2023 AMD Inc.
+
+import sys
+
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+from aie.dialects.scf import *
+from aie.extras.context import mlir_mod_ctx
+
+
+def my_relu():
+    """Build the MLIR-AIE design for a bf16 ReLU and print it to stdout.
+
+    N (65536) bf16 elements are processed in tiles of n (1024) elements by
+    n_cores (2) compute tiles in column 0, fed through the memory tile. Each
+    core calls the external kernel ``bf16_relu`` (object file ``bf16_relu.o``)
+    once per tile. When ``enable_tracing`` is set, trace output from the first
+    core is routed to the shim and written behind the output data.
+    """
+
+    word_size_in = 2  # bytes per bf16 element
+    N = 65536
+    N_in_bytes = N * word_size_in
+
+    # The shim DMA transfers below are sized in 32-bit words
+    A_sz_in_i32s = N_in_bytes // 4
+    C_sz_in_i32s = N_in_bytes // 4
+
+    # NOTE(review): tracing is unconditionally on, so the trace BD below always
+    # targets [N_in_bytes, N_in_bytes + trace_size) of the output buffer. The
+    # host (test.cpp) only allocates that extra space when --trace_sz is
+    # passed -- confirm that running without it is safe.
+    enable_tracing = True
+    trace_size = 65536
+
+    # Tile sizes
+    n = 1024  # must match the tile size hard-coded in bf16_relu (relu.cc)
+    N_div_n = N // n
+
+    n_cores = 2
+    tiles = N_div_n // n_cores  # tiles processed by each core per run
+    buffer_depth = 2
+
+    with mlir_mod_ctx() as ctx:
+
+        @device(AIEDevice.ipu)
+        def device_body():
+            memRef_ty = T.memref(n, T.bf16())
+
+            # Type used in the tile memory
+            memRef_A_ty = T.memref(n, T.bf16())
+            memRef_C_ty = T.memref(n, T.bf16())
+
+            # Type used in the memory tile which aggregates across the n_cores cores
+            memRef_A_MT_ty = T.memref(n * n_cores, T.bf16())
+            memRef_C_MT_ty = T.memref(n * n_cores, T.bf16())
+
+            # AIE Core Function declarations
+
+            bf16_relu = external_func("bf16_relu", inputs=[memRef_ty, memRef_ty])
+
+            # Tile declarations
+            ShimTile = tile(0, 0)
+
+            MemTile = tile(0, 1)
+            cores = [tile(0, 2 + i) for i in range(n_cores)]
+
+            inA_fifo_names = [f"memA{i}" for i in range(n_cores)]
+            outC_fifo_names = [f"memC{i}" for i in range(n_cores)]
+
+            inA_fifos = {}
+            outC_fifos = {}
+
+            # AIE-array data movement with object fifos
+            # Input A: shim -> memory tile, then one fifo per core linked to it
+            inA = object_fifo("inA", ShimTile, MemTile, buffer_depth, memRef_A_MT_ty)
+            for i in range(n_cores):
+                inA_fifos[inA_fifo_names[i]] = object_fifo(
+                    inA_fifo_names[i], MemTile, cores[i], buffer_depth, memRef_A_ty
+                )
+            object_fifo_link(inA, inA_fifo_names)
+
+            # Output C: one fifo per core -> memory tile, linked into memory tile -> shim
+            for i in range(n_cores):
+                outC_fifos[outC_fifo_names[i]] = object_fifo(
+                    outC_fifo_names[i], cores[i], MemTile, buffer_depth, memRef_C_ty
+                )
+            outC = object_fifo("outC", MemTile, ShimTile, buffer_depth, memRef_C_MT_ty)
+            object_fifo_link(outC_fifo_names[0:n_cores], outC)
+
+            # Set up a circuit-switched flow from core to shim for tracing information
+            # (only the first core, cores[0], is traced)
+            if enable_tracing:
+                flow(cores[0], WireBundle.Trace, 0, ShimTile, WireBundle.DMA, 1)
+
+            # Set up compute tiles
+            for i in range(n_cores):
+                # Compute tile i
+                @core(cores[i], "bf16_relu.o")
+                def core_body():
+                    # Outer loop: effectively "forever"; inner loop: one full run
+                    for _ in for_(0xFFFFFFFF):
+                        for _ in for_(tiles):
+                            elem_out = outC_fifos[outC_fifo_names[i]].acquire(
+                                ObjectFifoPort.Produce, 1
+                            )
+                            elem_in_a = inA_fifos[inA_fifo_names[i]].acquire(
+                                ObjectFifoPort.Consume, 1
+                            )
+
+                            call(bf16_relu, [elem_in_a, elem_out])
+
+                            inA_fifos[inA_fifo_names[i]].release(
+                                ObjectFifoPort.Consume, 1
+                            )
+                            outC_fifos[outC_fifo_names[i]].release(
+                                ObjectFifoPort.Produce, 1
+                            )
+                            yield_([])
+                        yield_([])
+
+            # To/from AIE-array data movement
+            # NOTE(review): this declares N i32 elements, but the transfers below
+            # move only A_sz_in_i32s / C_sz_in_i32s (= N // 2) words -- confirm
+            # whether the memref size here is intentional.
+            tensor_ty = T.memref(N, T.i32())
+
+            @FuncOp.from_py_func(tensor_ty, tensor_ty)
+            def sequence(A, C):
+
+                # Configure tracing, see https://github.com/Xilinx/mlir-aie/blob/resnet/docs/Tracing.md
+                if enable_tracing:
+                    # 0x340D0: Trace Control 0
+                    #          0xAABB---C
+                    #            AA        <- Event to stop trace capture
+                    #              BB      <- Event to start trace capture
+                    #                   C  <- Trace mode, 00=event-time, 01=event-PC, 10=execution
+                    # Configure so that "Event 1" (always true) causes tracing to start
+                    ipu_write32(
+                        column=0,
+                        row=2,
+                        address=0x340D0,
+                        value=0x00010000,
+                    )
+                    # 0x340D4: Trace Control 1
+                    ipu_write32(
+                        column=0,
+                        row=2,
+                        address=0x340D4,
+                        value=0x00000000,
+                    )
+                    # 0x340E0: Trace Event Group 1  (Which events to trace)
+                    #          0xAABBCCDD    AA, BB, CC, DD <- four event slots
+                    # Slots BB=0x22 and CC=0x21 are populated here; presumably the
+                    # events raised by event1()/event0() in relu.cc -- TODO confirm
+                    ipu_write32(
+                        column=0,
+                        row=2,
+                        address=0x340E0,
+                        value=0x00222100,
+                    )
+                    # 0x340E4: Trace Event Group 2  (Which events to trace)
+                    #          0xAABBCCDD    AA, BB, CC, DD <- four event slots
+                    ipu_write32(
+                        column=0,
+                        row=2,
+                        address=0x340E4,
+                        value=0x00000000,
+                    )
+
+                    # NOTE(review): register 0x3FF00 / value 0x121 is not explained
+                    # anywhere in this file -- document what it selects.
+                    ipu_write32(
+                        column=0,
+                        row=2,
+                        address=0x3FF00,
+                        value=0x00000121,
+                    )
+
+                    # Configure a buffer descriptor to write tracing information that has been routed into this shim tile
+                    # out to host DDR memory
+                    trace_bd_id = 13  # use BD 13 for writing trace output from compute tile to DDR host memory
+                    # Trace data is placed directly after the N_in_bytes of ReLU output
+                    output_size = N_in_bytes
+                    ipu_writebd_shimtile(
+                        bd_id=trace_bd_id,
+                        buffer_length=trace_size,
+                        buffer_offset=output_size,
+                        enable_packet=0,
+                        out_of_order_id=0,
+                        packet_id=0,
+                        packet_type=0,
+                        column=0,
+                        column_num=1,
+                        d0_size=0,
+                        d0_stride=0,
+                        d1_size=0,
+                        d1_stride=0,
+                        d2_stride=0,
+                        ddr_id=1,
+                        iteration_current=0,
+                        iteration_size=0,
+                        iteration_stride=0,
+                        lock_acq_enable=0,
+                        lock_acq_id=0,
+                        lock_acq_val=0,
+                        lock_rel_id=0,
+                        lock_rel_val=0,
+                        next_bd=0,
+                        use_next_bd=0,
+                        valid_bd=1,
+                    )
+                    # Set start BD to our shim bd_Id (13)
+                    ipu_write32(column=0, row=0, address=0x1D20C, value=trace_bd_id)
+
+                # Queue the output and input transfers, then wait on the sync below
+                ipu_dma_memcpy_nd(
+                    metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, C_sz_in_i32s]
+                )
+                ipu_dma_memcpy_nd(
+                    metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s]
+                )
+                ipu_sync(column=0, row=0, direction=0, channel=0)
+
+    # Emit the generated module; the Makefile redirects this into build/aie.mlir
+    print(ctx.module)
+
+
+my_relu()
diff --git a/programming_examples/basic/relu/test.cpp b/programming_examples/basic/relu/test.cpp
new file mode 100644
index 0000000000..14bb24babe
--- /dev/null
+++ b/programming_examples/basic/relu/test.cpp
@@ -0,0 +1,326 @@
+//===- test.cpp -------------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+// NOTE(review): the header names on the #include <...> lines and all template
+// arguments were lost in the copy of this patch under review (everything
+// between '<' and '>' was stripped). The headers listed here are the ones the
+// code below needs; confirm against the original commit before applying.
+#include <algorithm>
+#include <boost/program_options.hpp>
+#include <chrono>
+#include <cmath>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <stdfloat>
+#include <string>
+#include <vector>
+
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+constexpr bool VERIFY = true;
+
+constexpr int IN_SIZE = 65536;
+constexpr int OUT_SIZE = IN_SIZE;
+
+// Number of AIE compute cores the design uses; must match n_cores in aie2.py.
+constexpr int N_CORES = 2;
+
+namespace po = boost::program_options;
+
+// Throws std::runtime_error unless option `name` was given on the command line
+// and names a file that can be opened for reading.
+void check_arg_file_exists(po::variables_map &vm_in, std::string name) {
+  if (!vm_in.count(name)) {
+    throw std::runtime_error("Error: no " + name + " file was provided\n");
+  } else {
+    std::ifstream test(vm_in[name].as<std::string>());
+    if (!test) {
+      throw std::runtime_error("The " + name + " file " +
+                               vm_in[name].as<std::string>() +
+                               " does not exist.\n");
+    }
+  }
+}
+
+// Writes `trace_size` bytes starting at `traceOutPtr` to `path`, one 32-bit
+// word per line as 8 zero-padded hex digits. Trailing bytes that do not fill a
+// whole word are ignored.
+void write_out_trace(char *traceOutPtr, size_t trace_size, std::string path) {
+  std::ofstream fout(path);
+  const uint32_t *traceOut = reinterpret_cast<const uint32_t *>(traceOutPtr);
+  const size_t num_words = trace_size / sizeof(traceOut[0]);
+  for (size_t i = 0; i < num_words; i++) {
+    fout << std::setfill('0') << std::setw(8) << std::hex << traceOut[i]
+         << '\n';
+  }
+}
+
+// Returns a pseudo-random value in [-bias, scale - bias]. main() calls it with
+// (4.0, 2.0), i.e. [-2, 2], so that both the clamped (negative) and the
+// pass-through (positive) side of ReLU are exercised.
+static inline std::bfloat16_t random_bfloat16_t(float scale, float bias) {
+  return std::bfloat16_t((scale * (float)rand() / (float)(RAND_MAX)) - bias);
+}
+
+// True when |a - b| / 4 < 0.001, i.e. the two values differ by less than 0.004.
+bool nearly_equal(std::bfloat16_t a, std::bfloat16_t b) {
+  const float diff = std::fabs(static_cast<float>(a) - static_cast<float>(b));
+  return (diff / 4.0f) < 0.001f;
+}
+
+// Reads the instruction file: one hexadecimal 32-bit word per line.
+// Throws std::runtime_error on the first line that does not parse.
+std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
+  std::ifstream instr_file(instr_path);
+  std::string line;
+  std::vector<uint32_t> instr_v;
+  while (std::getline(instr_file, line)) {
+    std::istringstream iss(line);
+    uint32_t a;
+    if (!(iss >> std::hex >> a)) {
+      throw std::runtime_error("Unable to parse instruction file\n");
+    }
+    instr_v.push_back(a);
+  }
+  return instr_v;
+}
+
+// Runs the ReLU design on the NPU, verifies the result against max(x, 0) on
+// the host, optionally dumps the hardware trace, and reports NPU and CPU
+// timings. Returns 0 on success (or after printing help), 1 on bad arguments,
+// a missing kernel, or a verification failure.
+int main(int argc, const char *argv[]) {
+
+  // Program arguments parsing
+  po::options_description desc("Allowed options");
+
+  desc.add_options()("help,h", "produce help message")(
+      "xclbin,x", po::value<std::string>()->required(),
+      "the input xclbin path")(
+      "kernel,k", po::value<std::string>()->required(),
+      "the kernel name in the XCLBIN (for instance PP_PRE_FD)")(
+      "trace_sz,t", po::value<int>()->default_value(0),
+      "the depth of the trace buffer")(
+      "trace_file,f", po::value<std::string>()->default_value("trace.txt"),
+      "the output trace path")("verbosity,v",
+                               po::value<int>()->default_value(0),
+                               "the verbosity of the output")(
+      "instr,i", po::value<std::string>()->required(),
+      "path of file containing userspace instructions to be sent to the LX6");
+  po::variables_map vm;
+
+  try {
+    po::store(po::parse_command_line(argc, argv, desc), vm);
+
+    // Check for --help before notify(): notify() throws when a required
+    // option is missing, which would otherwise make a bare --help unreachable.
+    if (vm.count("help")) {
+      std::cout << desc << "\n";
+      return 1;
+    }
+
+    po::notify(vm);
+  } catch (const std::exception &ex) {
+    std::cerr << ex.what() << "\n\n";
+    std::cerr << "Usage:\n" << desc << "\n";
+    return 1;
+  }
+
+  check_arg_file_exists(vm, "xclbin");
+  check_arg_file_exists(vm, "instr");
+
+  std::vector<uint32_t> instr_v =
+      load_instr_sequence(vm["instr"].as<std::string>());
+
+  int verbosity = vm["verbosity"].as<int>();
+  if (verbosity >= 1)
+    std::cout << "Sequence instr count: " << instr_v.size() << "\n";
+
+  int trace_size = vm["trace_sz"].as<int>();
+  if (trace_size < 0) {
+    // A negative size would shrink the output buffer allocated below.
+    std::cerr << "Error: trace_sz must not be negative\n";
+    return 1;
+  }
+
+  // Start the XRT test code
+  // Get a device handle
+  unsigned int device_index = 0;
+  auto device = xrt::device(device_index);
+
+  // Load the xclbin
+  if (verbosity >= 1)
+    std::cout << "Loading xclbin: " << vm["xclbin"].as<std::string>() << "\n";
+  auto xclbin = xrt::xclbin(vm["xclbin"].as<std::string>());
+
+  if (verbosity >= 1)
+    std::cout << "Kernel opcode: " << vm["kernel"].as<std::string>() << "\n";
+  std::string Node = vm["kernel"].as<std::string>();
+
+  // Get the kernel from the xclbin: the first one whose name starts with Node
+  auto xkernels = xclbin.get_kernels();
+  auto kernel_it = std::find_if(xkernels.begin(), xkernels.end(),
+                                [&Node](xrt::xclbin::kernel &k) {
+                                  auto name = k.get_name();
+                                  std::cout << "Name: " << name << std::endl;
+                                  return name.rfind(Node, 0) == 0;
+                                });
+  if (kernel_it == xkernels.end()) {
+    // Dereferencing end() would be undefined behaviour; fail cleanly instead.
+    std::cerr << "Error: no kernel whose name starts with \"" << Node
+              << "\" was found in the xclbin\n";
+    return 1;
+  }
+  auto xkernel = *kernel_it;
+  auto kernelName = xkernel.get_name();
+
+  if (verbosity >= 1)
+    std::cout << "Registering xclbin: " << vm["xclbin"].as<std::string>()
+              << "\n";
+
+  device.register_xclbin(xclbin);
+
+  // get a hardware context
+  if (verbosity >= 1)
+    std::cout << "Getting hardware context.\n";
+  xrt::hw_context context(device, xclbin.get_uuid());
+
+  // get a kernel handle
+  if (verbosity >= 1)
+    std::cout << "Getting handle to kernel:" << kernelName << "\n";
+  auto kernel = xrt::kernel(context, kernelName);
+
+  auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(uint32_t),
+                          XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0));
+  auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(std::bfloat16_t),
+                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
+
+  // The trace (if requested) is stored directly behind the ReLU output.
+  auto real_out_size = OUT_SIZE * sizeof(std::bfloat16_t) + trace_size;
+  auto bo_out = xrt::bo(device, real_out_size, XRT_BO_FLAGS_HOST_ONLY,
+                        kernel.group_id(3));
+
+  if (verbosity >= 1)
+    std::cout << "Writing data into buffer objects.\n";
+
+  std::bfloat16_t *bufA = bo_inA.map<std::bfloat16_t *>();
+  std::vector<std::bfloat16_t> AVec(IN_SIZE);
+  for (int i = 0; i < IN_SIZE; i++)
+    AVec[i] = random_bfloat16_t(4.0, 2.0);
+  memcpy(bufA, AVec.data(), (AVec.size() * sizeof(std::bfloat16_t)));
+
+  void *bufInstr = bo_instr.map<void *>();
+  memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(uint32_t));
+
+  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  // Total mismatches over all iterations; decides the final PASS/FAIL.
+  int sticky_errors = 0;
+
+  unsigned num_iter = 2;
+  float npu_time_total = 0;
+  float npu_time_min = 9999999;
+  float npu_time_max = 0;
+  for (unsigned iter = 0; iter < num_iter; iter++) {
+
+    if (verbosity >= 1)
+      std::cout << "Running Kernel.\n";
+
+    auto start = std::chrono::high_resolution_clock::now();
+
+    auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_out);
+    run.wait();
+    auto stop = std::chrono::high_resolution_clock::now();
+
+    bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+
+    std::bfloat16_t *bufOut = bo_out.map<std::bfloat16_t *>();
+
+    int errors = 0;
+
+    if (VERIFY) {
+      if (verbosity >= 1) {
+        std::cout << "Verifying results ..." << std::endl;
+      }
+      for (int i = 0; i < IN_SIZE; i++) {
+        // Reference ReLU: max(input, 0)
+        std::bfloat16_t ref = 0.0;
+        if (AVec[i] > 0.0)
+          ref = AVec[i];
+        if (!nearly_equal(*(bufOut + i), ref)) {
+          std::cout << "Error in " << i << " output " << *(bufOut + i)
+                    << " != " << ref << " actual max(" << AVec[i] << ", 0.0)"
+                    << std::endl;
+          errors++;
+          sticky_errors++;
+        } else {
+          if (verbosity >= 2)
+            std::cout << "Correct " << i << " output " << *(bufOut + i)
+                      << " == " << ref << std::endl;
+        }
+      }
+    } else {
+      if (verbosity >= 1)
+        std::cout << "WARNING: ReLU results not verified." << std::endl;
+    }
+
+    float npu_time =
+        std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
+            .count();
+
+    npu_time_total += npu_time;
+    npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min;
+    npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max;
+
+    // The trace file is rewritten on every iteration; the last one wins.
+    if (trace_size > 0) {
+      write_out_trace(((char *)bufOut) + (OUT_SIZE * sizeof(std::bfloat16_t)),
+                      trace_size, vm["trace_file"].as<std::string>());
+    }
+
+    if (VERIFY) {
+      if (!errors) {
+        std::cout << iter << ": pass!\n";
+      } else {
+        std::cout << iter << ": fail! " << errors << " errors\n";
+      }
+    }
+  }
+
+  std::cout << "Avg NPU exec time: " << npu_time_total / num_iter << "us."
+            << std::endl;
+  std::cout << "Min NPU exec time: " << npu_time_min << "us." << std::endl;
+  std::cout << "Max NPU exec time: " << npu_time_max << "us." << std::endl;
+
+  // Estimate how many cycles a core spends per ReLU element. The work is
+  // split evenly over N_CORES cores, so each core handles IN_SIZE / N_CORES
+  // elements.
+  int per_core_calcs = IN_SIZE / N_CORES;
+  float avg_npu_time = npu_time_total / num_iter;
+  // Time is in us; converting to ns gives clock cycles only if the AIE runs
+  // at 1 GHz -- TODO confirm the clock rate.
+  float avg_npu_clocks = avg_npu_time / 1.0E-3;
+  float clocks_per_calc = avg_npu_clocks / per_core_calcs;
+  std::cout << "Clocks per calc " << clocks_per_calc << std::endl;
+
+  // Lets benchmark the CPU on the same operation (ReLU)
+  float cpu_time_total = 0;
+  float cpu_time_min = 9999999;
+  float cpu_time_max = 0;
+  const std::bfloat16_t zero(0.0f);
+  for (unsigned iter = 0; iter < num_iter; iter++) {
+
+    std::vector<std::bfloat16_t> cpuIn(IN_SIZE);
+    std::vector<std::bfloat16_t> ResVec(IN_SIZE);
+    for (int i = 0; i < IN_SIZE; i++) {
+      cpuIn[i] = random_bfloat16_t(4.0, 2.0);
+    }
+    auto start = std::chrono::high_resolution_clock::now();
+    for (int i = 0; i < IN_SIZE; i++) {
+      ResVec[i] = (cpuIn[i] > zero) ? cpuIn[i] : zero;
+    }
+    auto stop = std::chrono::high_resolution_clock::now();
+    float cpu_time =
+        std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
+            .count();
+
+    cpu_time_total += cpu_time;
+    cpu_time_min = (cpu_time < cpu_time_min) ? cpu_time : cpu_time_min;
+    cpu_time_max = (cpu_time > cpu_time_max) ? cpu_time : cpu_time_max;
+  }
+  std::cout << "Avg CPU exec time: " << cpu_time_total / num_iter << "us."
+            << std::endl;
+  std::cout << "Min CPU exec time: " << cpu_time_min << "us." << std::endl;
+  std::cout << "Max CPU exec time: " << cpu_time_max << "us." << std::endl;
+
+  if (VERIFY) {
+    if (!sticky_errors) {
+      std::cout << std::endl << "PASS!" << std::endl << std::endl;
+      return 0;
+    } else {
+      std::cout << std::endl << "FAIL." << std::endl << std::endl;
+      return 1;
+    }
+  } else {
+    std::cout << "Verification skipped, but I'm sure it worked.  I trust in you"
+              << std::endl;
+  }
+  return 0;
+}