diff --git a/aie_kernels/relu.cc b/aie_kernels/aie2/relu.cc similarity index 100% rename from aie_kernels/relu.cc rename to aie_kernels/aie2/relu.cc diff --git a/programming_examples/basic/relu/test.cpp b/programming_examples/basic/relu/test.cpp deleted file mode 100644 index 14bb24babe..0000000000 --- a/programming_examples/basic/relu/test.cpp +++ /dev/null @@ -1,326 +0,0 @@ -//===- test.cpp -------------------------------------------000---*- C++ -*-===// -// -// This file is licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// Copyright (C) 2023, Advanced Micro Devices, Inc. -// -//===----------------------------------------------------------------------===// - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "xrt/xrt_bo.h" -#include "xrt/xrt_device.h" -#include "xrt/xrt_kernel.h" - -constexpr bool VERIFY = true; - -constexpr int IN_SIZE = 65536; -constexpr int OUT_SIZE = IN_SIZE; - -namespace po = boost::program_options; - -void check_arg_file_exists(po::variables_map &vm_in, std::string name) { - if (!vm_in.count(name)) { - throw std::runtime_error("Error: no " + name + " file was provided\n"); - } else { - std::ifstream test(vm_in[name].as()); - if (!test) { - throw std::runtime_error("The " + name + " file " + - vm_in[name].as() + - " does not exist.\n"); - } - } -} - -void write_out_trace(char *traceOutPtr, size_t trace_size, std::string path) { - std::ofstream fout(path); - uint32_t *traceOut = (uint32_t *)traceOutPtr; - for (int i = 0; i < trace_size / sizeof(traceOut[0]); i++) { - fout << std::setfill('0') << std::setw(8) << std::hex << (int)traceOut[i]; - fout << std::endl; - } -} - -static inline std::bfloat16_t random_bfloat16_t(float scale, float bias) { - // Random numbers should NOT be uniformly between 0 and 1, because that - // would make the matrix product AB always close to 1. - return std::bfloat16_t((scale * (float)rand() / (float)(RAND_MAX)) - bias); -} - -bool nearly_equal(std::bfloat16_t a, std::bfloat16_t b) { - std::bfloat16_t diff = fabs(a - b); - if ((diff / 4.0) < 0.001) - return true; - else - return false; -} - -std::vector load_instr_sequence(std::string instr_path) { - std::ifstream instr_file(instr_path); - std::string line; - std::vector instr_v; - while (std::getline(instr_file, line)) { - std::istringstream iss(line); - uint32_t a; - if (!(iss >> std::hex >> a)) { - throw std::runtime_error("Unable to parse instruction file\n"); - } - instr_v.push_back(a); - } - return instr_v; -} - -int main(int argc, const char *argv[]) { - - // Program arguments parsing - po::options_description desc("Allowed options"); - - desc.add_options()("help,h", "produce help message")( - "xclbin,x", po::value()->required(), - "the input xclbin path")( - "kernel,k", po::value()->required(), - "the kernel name in the XCLBIN (for instance PP_PRE_FD)")( - "trace_sz,t", po::value()->default_value(0), - "the depth of the trace buffer")( - "trace_file,f", po::value()->default_value("trace.txt"), - "the output trace path")("verbosity,v", - po::value()->default_value(0), - "the verbosity of the output")( - "instr,i", po::value()->required(), - "path of file containing userspace instructions to be sent to the LX6"); - po::variables_map vm; - - try { - po::store(po::parse_command_line(argc, argv, desc), vm); - po::notify(vm); - - if (vm.count("help")) { - std::cout << desc << "\n"; - return 1; - } - } catch (const std::exception &ex) { - std::cerr << ex.what() << "\n\n"; - std::cerr << "Usage:\n" << desc << "\n"; - return 1; - } - - check_arg_file_exists(vm, "xclbin"); - check_arg_file_exists(vm, "instr"); - - std::vector instr_v = - load_instr_sequence(vm["instr"].as()); - - int verbosity = vm["verbosity"].as(); - if (verbosity >= 1) - std::cout << "Sequence instr count: " << instr_v.size() << "\n"; - - int trace_size = vm["trace_sz"].as(); - - // Start the XRT test code - // Get a device handle - unsigned int device_index = 0; - auto device = xrt::device(device_index); - - // Load the xclbin - if (verbosity >= 1) - std::cout << "Loading xclbin: " << vm["xclbin"].as() << "\n"; - auto xclbin = xrt::xclbin(vm["xclbin"].as()); - - if (verbosity >= 1) - std::cout << "Kernel opcode: " << vm["kernel"].as() << "\n"; - std::string Node = vm["kernel"].as(); - - // Get the kernel from the xclbin - auto xkernels = xclbin.get_kernels(); - auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(), - [Node](xrt::xclbin::kernel &k) { - auto name = k.get_name(); - std::cout << "Name: " << name << std::endl; - return name.rfind(Node, 0) == 0; - }); - auto kernelName = xkernel.get_name(); - - if (verbosity >= 1) - std::cout << "Registering xclbin: " << vm["xclbin"].as() - << "\n"; - - device.register_xclbin(xclbin); - - // get a hardware context - if (verbosity >= 1) - std::cout << "Getting hardware context.\n"; - xrt::hw_context context(device, xclbin.get_uuid()); - - // get a kernel handle - if (verbosity >= 1) - std::cout << "Getting handle to kernel:" << kernelName << "\n"; - auto kernel = xrt::kernel(context, kernelName); - - auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); - auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(std::bfloat16_t), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); - - auto real_out_size = OUT_SIZE * sizeof(std::bfloat16_t) + trace_size; - auto bo_out = xrt::bo(device, real_out_size, XRT_BO_FLAGS_HOST_ONLY, - kernel.group_id(3)); - - if (verbosity >= 1) - std::cout << "Writing data into buffer objects.\n"; - - std::bfloat16_t *bufA = bo_inA.map(); - std::vector AVec(IN_SIZE); - for (int i = 0; i < IN_SIZE; i++) - AVec[i] = random_bfloat16_t(4.0, 2.0); - memcpy(bufA, AVec.data(), (AVec.size() * sizeof(std::bfloat16_t))); - - void *bufInstr = bo_instr.map(); - memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); - - bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); - bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE); - - int sticky_errors = 0; - - unsigned num_iter = 2; - float npu_time_total = 0; - float npu_time_min = 9999999; - float npu_time_max = 0; - for (unsigned iter = 0; iter < num_iter; iter++) { - - if (verbosity >= 1) - std::cout << "Running Kernel.\n"; - - auto start = std::chrono::high_resolution_clock::now(); - - auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_out); - run.wait(); - auto stop = std::chrono::high_resolution_clock::now(); - - bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); - - std::bfloat16_t *bufOut = bo_out.map(); - - int errors = 0; - - if (VERIFY) { - if (verbosity >= 1) { - std::cout << "Verifying results ..." << std::endl; - } - for (uint32_t i = 0; i < IN_SIZE; i++) { - std::bfloat16_t ref = 0.0; - if (AVec[i] > 0.0) - ref = AVec[i]; - if (!nearly_equal(*(bufOut + i), ref)) { - std::cout << "Error in " << i << " output " << *(bufOut + i) - << " != " << ref << " actual max(" << AVec[i] << ", 0.0" - << std::endl; - errors++; - sticky_errors++; - } else { - if (verbosity >= 2) - std::cout << "Correct " << i << " output " << *(bufOut + i) - << " == " << ref << std::endl; - } - } - } else { - if (verbosity >= 1) - std::cout << "WARNING: vector-scalar results not verified." - << std::endl; - } - - float npu_time = - std::chrono::duration_cast(stop - start) - .count(); - - npu_time_total += npu_time; - npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min; - npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max; - - if (trace_size > 0) { - write_out_trace(((char *)bufOut) + (OUT_SIZE * 2), trace_size, - vm["trace_file"].as()); - } - - if (VERIFY) { - if (!errors) { - std::cout << iter << ": pass!\n"; - } else { - std::cout << iter << ": fail! " << errors << " errors\n"; - } - } - } - - std::cout << "Avg NPU exec time: " << npu_time_total / num_iter << "us." - << std::endl; - std::cout << "Min NPU matmul time: " << npu_time_min << "us." << std::endl; - std::cout << "Max NPU matmul time: " << npu_time_max << "us." << std::endl; - - // Let's figure out how many cycles it takes a core to do a single e^x - // There are 4 cores, so the total number of e^x's it does is one quarter of - // the test size - - int per_core_calcs = IN_SIZE / 4; - float avg_npu_time = npu_time_total / num_iter; - float avg_npu_clocks = - avg_npu_time / 1.0E-3; // Time is in uS, but the AIE is clocked in nS - float clocks_per_calc = avg_npu_clocks / per_core_calcs; - std::cout << "Clocks per calc " << clocks_per_calc << std::endl; - - // Lets benchmark the CPU - float cpu_time_total = 0; - float cpu_time_min = 9999999; - float cpu_time_max = 0; - for (unsigned iter = 0; iter < num_iter; iter++) { - - std::vector AVec(IN_SIZE); - std::vector ResVec(IN_SIZE); - for (int i = 0; i < IN_SIZE; i++) { - AVec[i] = random_bfloat16_t(4.0, 2.0); - } - auto start = std::chrono::high_resolution_clock::now(); - for (int i = 0; i < IN_SIZE; i++) { - ResVec[i] = exp(AVec[i]); - } - auto stop = std::chrono::high_resolution_clock::now(); - float cpu_time = - std::chrono::duration_cast(stop - start) - .count(); - - cpu_time_total += cpu_time; - cpu_time_min = (cpu_time < cpu_time_min) ? cpu_time : cpu_time_min; - cpu_time_max = (cpu_time > cpu_time_max) ? cpu_time : cpu_time_max; - } - std::cout << "Avg CPU exec time: " << cpu_time_total / num_iter << "us." - << std::endl; - std::cout << "Min CPU matmul time: " << cpu_time_min << "us." << std::endl; - std::cout << "Max CPU matmul time: " << cpu_time_max << "us." << std::endl; - - if (VERIFY) { - if (!sticky_errors) { - std::cout << std::endl << "PASS!" << std::endl << std::endl; - return 0; - } else { - std::cout << std::endl << "FAIL." << std::endl << std::endl; - return 1; - } - } else { - std::cout << "Verification skipped, but I'm sure it worked. I trust in you" - << std::endl; - } - return 0; -} diff --git a/programming_examples/basic/vector_softmax/aie2.py.orig b/programming_examples/basic/vector_softmax/aie2.py.orig deleted file mode 100755 index bbdc8ab5c8..0000000000 --- a/programming_examples/basic/vector_softmax/aie2.py.orig +++ /dev/null @@ -1,121 +0,0 @@ -# -# This file is licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -# (c) Copyright 2023 AMD Inc. - -import sys - -from aie.dialects.aie import * -from aie.dialects.aiex import * -from aie.dialects.scf import * -from aie.extras.context import mlir_mod_ctx - - -def my_eltwise_add(): - - word_size_in = 2 - N = 65536 #*1024 - N_in_bytes = N * word_size_in - - A_sz_in_i32s = N_in_bytes // 4 - C_sz_in_i32s = N_in_bytes // 4 - - # Tile sizes - n = 1024 - N_div_n = N // n - - n_cores = 4 - tiles = N_div_n // n_cores - buffer_depth = 2 - - with mlir_mod_ctx() as ctx: - - @device(AIEDevice.ipu) - def device_body(): - memRef_ty = T.memref(n, T.bf16()) - - # Type used in the tile memory - memRef_A_ty = T.memref(n, T.bf16()) - memRef_C_ty = T.memref(n, T.bf16()) - - # Type used in the memory tile which aggregates across the 4 cores - memRef_A_MT_ty = T.memref(n * n_cores, T.bf16()) - memRef_C_MT_ty = T.memref(n * n_cores, T.bf16()) - - # AIE Core Function declarations - - exp_bf16_vector = external_func("exp_bf16_vector", inputs=[memRef_ty, memRef_ty]) - - # Tile declarations - ShimTile = tile(0, 0) - - MemTile = tile(0, 1) - cores = [tile(0, 2 + i) for i in range(n_cores)] - - inA_fifo_names = [f"memA{i}" for i in range(n_cores)] - outC_fifo_names = [f"memC{i}" for i in range(n_cores)] - - inA_fifos = {} - outC_fifos = {} - - # AIE-array data movement with object fifos - # Input A - inA = object_fifo("inA", ShimTile, MemTile, buffer_depth, memRef_A_MT_ty) - for i in range(n_cores): - inA_fifos[inA_fifo_names[i]] = object_fifo( - inA_fifo_names[i], MemTile, cores[i], buffer_depth, memRef_A_ty - ) - object_fifo_link(inA, inA_fifo_names) - - # Output C - for i in range(n_cores): - outC_fifos[outC_fifo_names[i]] = object_fifo( - outC_fifo_names[i], cores[i], MemTile, buffer_depth, memRef_C_ty - ) - outC = object_fifo("outC", MemTile, ShimTile, buffer_depth, memRef_C_MT_ty) - object_fifo_link(outC_fifo_names[0:n_cores], outC) - - # Set up compute tiles - for i in range(n_cores): - # Compute tile i - @core(cores[i], "kernels.a") - def core_body(): - for _ in for_(0xFFFFFFFF): - for _ in for_(tiles): - elem_out = outC_fifos[outC_fifo_names[i]].acquire( - ObjectFifoPort.Produce, 1 - ) - elem_in_a = inA_fifos[inA_fifo_names[i]].acquire( - ObjectFifoPort.Consume, 1 - ) - - call(exp_bf16_vector,[elem_in_a, elem_out]) - - inA_fifos[inA_fifo_names[i]].release( - ObjectFifoPort.Consume, 1 - ) - outC_fifos[outC_fifo_names[i]].release( - ObjectFifoPort.Produce, 1 - ) - yield_([]) - yield_([]) - - # To/from AIE-array data movement - tensor_ty = T.memref(N, T.i32()) - - @FuncOp.from_py_func(tensor_ty, tensor_ty) - def sequence(A, C): - ipu_dma_memcpy_nd( - metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, C_sz_in_i32s] - ) - ipu_dma_memcpy_nd( - metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s] - ) - ipu_sync(column=0, row=0, direction=0, channel=0) - - print(ctx.module) - - -my_eltwise_add() diff --git a/programming_examples/basic/vector_softmax/bf16_softmax.mlir.orig b/programming_examples/basic/vector_softmax/bf16_softmax.mlir.orig deleted file mode 100644 index cd72bcd0d3..0000000000 --- a/programming_examples/basic/vector_softmax/bf16_softmax.mlir.orig +++ /dev/null @@ -1,34 +0,0 @@ -module { - func.func @dut(%arg0: memref<1024xbf16>, %arg1: memref<1024xbf16>) { - %cst = arith.constant 0.000000e+00 : f32 - %cst_0 = arith.constant 1.000000e+00 : f32 - %cst_1 = arith.constant 0.000000e+00 : bf16 - %cst_2 = arith.constant dense<0xFF80> : vector<32xbf16> - %0 = affine.for %arg2 = 0 to 1024 step 32 iter_args(%arg3 = %cst_2) -> (vector<32xbf16>) { - %5 = vector.transfer_read %arg0[%arg2], %cst_1 : memref<1024xbf16>, vector<32xbf16> - %6 = arith.maximumf %arg3, %5 : vector<32xbf16> - affine.yield %6 : vector<32xbf16> - } - %1 = vector.reduction , %0 : vector<32xbf16> into bf16 - affine.for %arg2 = 0 to 1024 { - %5 = affine.load %arg0[%arg2] : memref<1024xbf16> - %6 = arith.subf %5, %1 : bf16 - %7 = math.exp %6 : bf16 - affine.store %7, %arg0[%arg2] : memref<1024xbf16> - } - %2 = affine.for %arg2 = 0 to 1024 iter_args(%arg3 = %cst) -> (f32) { - %5 = affine.load %arg0[%arg2] : memref<1024xbf16> - %6 = arith.extf %5 : bf16 to f32 - %7 = arith.addf %arg3, %6 : f32 - affine.yield %7 : f32 - } - %3 = arith.divf %cst_0, %2 : f32 - %4 = arith.truncf %3 : f32 to bf16 - affine.for %arg2 = 0 to 1024 { - %5 = affine.load %arg0[%arg2] : memref<1024xbf16> - %6 = arith.mulf %5, %4 : bf16 - affine.store %6, %arg1[%arg2] : memref<1024xbf16> - } - return - } -} \ No newline at end of file diff --git a/programming_examples/basic/vector_softmax/sweep.py b/programming_examples/basic/vector_softmax/sweep.py deleted file mode 100644 index fabf5e70da..0000000000 --- a/programming_examples/basic/vector_softmax/sweep.py +++ /dev/null @@ -1,20 +0,0 @@ -import os - -for action in ["rm -f", "touch"]: - cmd = f"{action} results.csv" - os.system(cmd) - - -for s in [16384, 32768, 65536, 131072, 262144]: - for i in [64, 128, 256, 512, 1024]: - for f in ["bf16_softmax.mlir", "test.cpp", "aie2.py"]: - sed = f"sed 's\\1024\\{i}\g' {f}.orig > {f}.first" - os.system(sed) - sed = f"sed 's\\65536\\{s}\g' {f}.first > {f}" - os.system(sed) - make_clean = f"make clean > /dev/null" - os.system(make_clean) - make_all = f"make all" - os.system(make_all) - make_profile = f"make profile" - os.system(make_profile) diff --git a/programming_examples/basic/vector_softmax/test.cpp b/programming_examples/basic/vector_softmax/test.cpp deleted file mode 100644 index 9354405139..0000000000 --- a/programming_examples/basic/vector_softmax/test.cpp +++ /dev/null @@ -1,320 +0,0 @@ -//===- test.cpp -------------------------------------------000---*- C++ -*-===// -// -// This file is licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// Copyright (C) 2023, Advanced Micro Devices, Inc. -// -//===----------------------------------------------------------------------===// - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "xrt/xrt_bo.h" -#include "xrt/xrt_device.h" -#include "xrt/xrt_kernel.h" - -constexpr bool VERIFY = true; - -constexpr int IN_SIZE = 262144; //*1024; -constexpr int TILE_SIZE = 1024; -constexpr int OUT_SIZE = IN_SIZE; - -namespace po = boost::program_options; - -void check_arg_file_exists(po::variables_map &vm_in, std::string name) { - if (!vm_in.count(name)) { - throw std::runtime_error("Error: no " + name + " file was provided\n"); - } else { - std::ifstream test(vm_in[name].as()); - if (!test) { - throw std::runtime_error("The " + name + " file " + - vm_in[name].as() + - " does not exist.\n"); - } - } -} - -static inline std::bfloat16_t random_bfloat16_t() { - // Random numbers should NOT be uniformly between 0 and 1, because that - // would make the matrix product AB always close to 1. - return std::bfloat16_t(4.0 * (float)rand() / (float)(RAND_MAX)); -} - -bool nearly_equal(std::bfloat16_t a, std::bfloat16_t b) { - std::bfloat16_t diff = fabs(a - b); - if ((diff / a) < 0.1) - return true; - else - return false; -} - -std::vector load_instr_sequence(std::string instr_path) { - std::ifstream instr_file(instr_path); - std::string line; - std::vector instr_v; - while (std::getline(instr_file, line)) { - std::istringstream iss(line); - uint32_t a; - if (!(iss >> std::hex >> a)) { - throw std::runtime_error("Unable to parse instruction file\n"); - } - instr_v.push_back(a); - } - return instr_v; -} - -int main(int argc, const char *argv[]) { - - // Program arguments parsing - po::options_description desc("Allowed options"); - - desc.add_options()("help,h", "produce help message")( - "xclbin,x", po::value()->required(), - "the input xclbin path")( - "kernel,k", po::value()->required(), - "the kernel name in the XCLBIN (for instance PP_PRE_FD)")( - "verbosity,v", po::value()->default_value(0), - "the verbosity of the output")( - "profile,p", po::value()->default_value(""), "CSV profile")( - "instr,i", po::value()->required(), - "path of file containing userspace instructions to be sent to the LX6"); - po::variables_map vm; - - try { - po::store(po::parse_command_line(argc, argv, desc), vm); - po::notify(vm); - - if (vm.count("help")) { - std::cout << desc << "\n"; - return 1; - } - } catch (const std::exception &ex) { - std::cerr << ex.what() << "\n\n"; - std::cerr << "Usage:\n" << desc << "\n"; - return 1; - } - - check_arg_file_exists(vm, "xclbin"); - check_arg_file_exists(vm, "instr"); - - std::vector instr_v = - load_instr_sequence(vm["instr"].as()); - - int verbosity = vm["verbosity"].as(); - if (verbosity >= 1) - std::cout << "Sequence instr count: " << instr_v.size() << "\n"; - - // Start the XRT test code - // Get a device handle - unsigned int device_index = 0; - auto device = xrt::device(device_index); - - // Load the xclbin - if (verbosity >= 1) - std::cout << "Loading xclbin: " << vm["xclbin"].as() << "\n"; - auto xclbin = xrt::xclbin(vm["xclbin"].as()); - - if (verbosity >= 1) - std::cout << "Kernel opcode: " << vm["kernel"].as() << "\n"; - std::string Node = vm["kernel"].as(); - - // Get the kernel from the xclbin - auto xkernels = xclbin.get_kernels(); - auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(), - [Node](xrt::xclbin::kernel &k) { - auto name = k.get_name(); - std::cout << "Name: " << name << std::endl; - return name.rfind(Node, 0) == 0; - }); - auto kernelName = xkernel.get_name(); - - if (verbosity >= 1) - std::cout << "Registering xclbin: " << vm["xclbin"].as() - << "\n"; - - device.register_xclbin(xclbin); - - // get a hardware context - if (verbosity >= 1) - std::cout << "Getting hardware context.\n"; - xrt::hw_context context(device, xclbin.get_uuid()); - - // get a kernel handle - if (verbosity >= 1) - std::cout << "Getting handle to kernel:" << kernelName << "\n"; - auto kernel = xrt::kernel(context, kernelName); - - auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); - auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(std::bfloat16_t), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); - auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(std::bfloat16_t), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); - - if (verbosity >= 1) - std::cout << "Writing data into buffer objects.\n"; - - std::bfloat16_t *bufA = bo_inA.map(); - std::vector AVec(IN_SIZE); - for (int i = 0; i < IN_SIZE; i++) - AVec[i] = random_bfloat16_t() / 8.0; - memcpy(bufA, AVec.data(), (AVec.size() * sizeof(std::bfloat16_t))); - - void *bufInstr = bo_instr.map(); - memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); - - bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); - bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE); - - int sticky_errors = 0; - - unsigned num_iter = 64; - float npu_time_total = 0; - float npu_time_min = 9999999; - float npu_time_max = 0; - - // Lets also benchmark the CPU - float cpu_time_total = 0; - float cpu_time_min = 9999999; - float cpu_time_max = 0; - - for (unsigned iter = 0; iter < num_iter; iter++) { - - if (verbosity >= 1) - std::cout << "Running Kernel.\n"; - - auto start = std::chrono::high_resolution_clock::now(); - - auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_out); - run.wait(); - auto stop = std::chrono::high_resolution_clock::now(); - - bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); - - std::bfloat16_t *bufOut = bo_out.map(); - - int errors = 0; - - if (VERIFY) { - if (verbosity >= 1) { - std::cout << "Verifying results ..." << std::endl; - } - - std::vector RefVec(IN_SIZE); - auto cpu_start = std::chrono::high_resolution_clock::now(); - - for (uint32_t t = 0; t < IN_SIZE; t += TILE_SIZE) { - float running = 0.0; - for (uint32_t i = 0; i < TILE_SIZE; i++) { - float ez = (float)(exp(AVec[t + i])); - running += ez; - RefVec[t + i] = exp(AVec[t + i]); - } - - for (uint32_t i = 0; i < TILE_SIZE; i++) { - RefVec[t + i] /= running; - } - } - auto cpu_stop = std::chrono::high_resolution_clock::now(); - float cpu_time = std::chrono::duration_cast( - cpu_stop - cpu_start) - .count(); - - cpu_time_total += cpu_time; - cpu_time_min = (cpu_time < cpu_time_min) ? cpu_time : cpu_time_min; - cpu_time_max = (cpu_time > cpu_time_max) ? cpu_time : cpu_time_max; - - for (uint32_t i = 0; i < IN_SIZE; i++) { - std::bfloat16_t ref = RefVec[i]; - if (!nearly_equal(*(bufOut + i), ref)) { - std::cout << "Error in " << i << " output " << *(bufOut + i) - << " != " << ref << " actual e^" << AVec[i] << " : " - << exp(AVec[i]) << std::endl; - errors++; - sticky_errors++; - } else { - if (verbosity >= 2) - std::cout << "Correct " << i << " output " << *(bufOut + i) - << " == " << ref << std::endl; - } - } - - } else { - if (verbosity >= 1) - std::cout << "WARNING: vector-scalar results not verified." - << std::endl; - } - - float npu_time = - std::chrono::duration_cast(stop - start) - .count(); - - npu_time_total += npu_time; - npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min; - npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max; - - std::string profile = vm["profile"].as(); - if (profile.length()) { - std::ofstream of; - of.open(profile, std::ios::app); // Append - of << IN_SIZE << "," << TILE_SIZE << "," << npu_time << std::endl; - } - - if (VERIFY) { - if (!errors) { - std::cout << iter << ": pass! in " << npu_time << "us" << std::endl; - } else { - std::cout << iter << ": fail! " << errors << " errors in " << npu_time - << "us" << std::endl; - } - } - } - - std::cout << "Avg NPU exec time: " << npu_time_total / num_iter << "us." - << std::endl; - std::cout << "Min NPU exec time: " << npu_time_min << "us." << std::endl; - std::cout << "Max NPU exec time: " << npu_time_max << "us." << std::endl; - - // Let's figure out how many cycles it takes a core to do a single e^x - // There are 4 cores, so the total number of e^x's it does is one quarter of - // the test size - - int per_core_calcs = IN_SIZE / 4; - float avg_npu_time = npu_time_total / num_iter; - float avg_npu_clocks = - avg_npu_time / 1.0E-3; // Time is in uS, but the AIE is clocked in nS - float clocks_per_calc = avg_npu_clocks / per_core_calcs; - std::cout << "Clocks per calc " << clocks_per_calc << std::endl; - - std::cout << "Avg CPU exec time: " << cpu_time_total / num_iter << "us." - << std::endl; - std::cout << "Min CPU exec time: " << cpu_time_min << "us." << std::endl; - std::cout << "Max CPU exec time: " << cpu_time_max << "us." << std::endl; - - if (VERIFY) { - if (!sticky_errors) { - std::cout << std::endl << "PASS!" << std::endl << std::endl; - return 0; - } else { - std::cout << std::endl << "FAIL." << std::endl << std::endl; - return 1; - } - } else { - std::cout << "Verification skipped, but I'm sure it worked. I trust in you" - << std::endl; - } - return 0; -} diff --git a/programming_examples/basic/vector_softmax/test.cpp.orig b/programming_examples/basic/vector_softmax/test.cpp.orig deleted file mode 100644 index 2fa314e946..0000000000 --- a/programming_examples/basic/vector_softmax/test.cpp.orig +++ /dev/null @@ -1,325 +0,0 @@ -//===- test.cpp -------------------------------------------000---*- C++ -*-===// -// -// This file is licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// Copyright (C) 2023, Advanced Micro Devices, Inc. -// -//===----------------------------------------------------------------------===// - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "xrt/xrt_bo.h" -#include "xrt/xrt_device.h" -#include "xrt/xrt_kernel.h" - -constexpr bool VERIFY = true; - -constexpr int IN_SIZE = 65536; //*1024; -constexpr int TILE_SIZE = 1024; -constexpr int OUT_SIZE = IN_SIZE; - -namespace po = boost::program_options; - -void check_arg_file_exists(po::variables_map &vm_in, std::string name) { - if (!vm_in.count(name)) { - throw std::runtime_error("Error: no " + name + " file was provided\n"); - } else { - std::ifstream test(vm_in[name].as()); - if (!test) { - throw std::runtime_error("The " + name + " file " + - vm_in[name].as() + - " does not exist.\n"); - } - } -} - -static inline std::bfloat16_t random_bfloat16_t() { - // Random numbers should NOT be uniformly between 0 and 1, because that - // would make the matrix product AB always close to 1. - return std::bfloat16_t(4.0 * (float)rand() / (float)(RAND_MAX)); -} - -bool nearly_equal(std::bfloat16_t a, std::bfloat16_t b) { - std::bfloat16_t diff = fabs(a - b); - if ((diff / a) < 0.1) - return true; - else - return false; -} - -std::vector load_instr_sequence(std::string instr_path) { - std::ifstream instr_file(instr_path); - std::string line; - std::vector instr_v; - while (std::getline(instr_file, line)) { - std::istringstream iss(line); - uint32_t a; - if (!(iss >> std::hex >> a)) { - throw std::runtime_error("Unable to parse instruction file\n"); - } - instr_v.push_back(a); - } - return instr_v; -} - -int main(int argc, const char *argv[]) { - - // Program arguments parsing - po::options_description desc("Allowed options"); - - desc.add_options()("help,h", "produce help message")( - "xclbin,x", po::value()->required(), - "the input xclbin path")( - "kernel,k", po::value()->required(), - "the kernel name in the XCLBIN (for instance PP_PRE_FD)")( - "verbosity,v", po::value()->default_value(0), - "the verbosity of the output")( - "profile,p", po::value()->default_value(""),"CSV profile")( - "instr,i", po::value()->required(), - "path of file containing userspace instructions to be sent to the LX6"); - po::variables_map vm; - - try { - po::store(po::parse_command_line(argc, argv, desc), vm); - po::notify(vm); - - if (vm.count("help")) { - std::cout << desc << "\n"; - return 1; - } - } catch (const std::exception &ex) { - std::cerr << ex.what() << "\n\n"; - std::cerr << "Usage:\n" << desc << "\n"; - return 1; - } - - check_arg_file_exists(vm, "xclbin"); - check_arg_file_exists(vm, "instr"); - - std::vector instr_v = - load_instr_sequence(vm["instr"].as()); - - int verbosity = vm["verbosity"].as(); - if (verbosity >= 1) - std::cout << "Sequence instr count: " << instr_v.size() << "\n"; - - // Start the XRT test code - // Get a device handle - unsigned int device_index = 0; - auto device = xrt::device(device_index); - - // Load the xclbin - if (verbosity >= 1) - std::cout << "Loading xclbin: " << vm["xclbin"].as() << "\n"; - auto xclbin = xrt::xclbin(vm["xclbin"].as()); - - if (verbosity >= 1) - std::cout << "Kernel opcode: " << vm["kernel"].as() << "\n"; - std::string Node = vm["kernel"].as(); - - // Get the kernel from the xclbin - auto xkernels = xclbin.get_kernels(); - auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(), - [Node](xrt::xclbin::kernel &k) { - auto name = k.get_name(); - std::cout << "Name: " << name << std::endl; - return name.rfind(Node, 0) == 0; - }); - auto kernelName = xkernel.get_name(); - - if (verbosity >= 1) - std::cout << "Registering xclbin: " << vm["xclbin"].as() - << "\n"; - - device.register_xclbin(xclbin); - - // get a hardware context - if (verbosity >= 1) - std::cout << "Getting hardware context.\n"; - xrt::hw_context context(device, xclbin.get_uuid()); - - // get a kernel handle - if (verbosity >= 1) - std::cout << "Getting handle to kernel:" << kernelName << "\n"; - auto kernel = xrt::kernel(context, kernelName); - - auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); - auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(std::bfloat16_t), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); - auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(std::bfloat16_t), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); - - if (verbosity >= 1) - std::cout << "Writing data into buffer objects.\n"; - - std::bfloat16_t *bufA = bo_inA.map(); - std::vector AVec(IN_SIZE); - for (int i = 0; i < IN_SIZE; i++) - AVec[i] = random_bfloat16_t()/8.0; - memcpy(bufA, AVec.data(), (AVec.size() * sizeof(std::bfloat16_t))); - - void *bufInstr = bo_instr.map(); - memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); - - bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); - bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE); - - int sticky_errors = 0; - - unsigned num_iter = 64; - float npu_time_total = 0; - float npu_time_min = 9999999; - float npu_time_max = 0; - - // Lets also benchmark the CPU - float cpu_time_total = 0; - float cpu_time_min = 9999999; - float cpu_time_max = 0; - - - for (unsigned iter = 0; iter < num_iter; iter++) { - - if (verbosity >= 1) - std::cout << "Running Kernel.\n"; - - auto start = std::chrono::high_resolution_clock::now(); - - auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_out); - run.wait(); - auto stop = std::chrono::high_resolution_clock::now(); - - bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); - - std::bfloat16_t *bufOut = bo_out.map(); - - int errors = 0; - - if (VERIFY) { - if (verbosity >= 1) { - std::cout << "Verifying results ..." << std::endl; - } - - std::vector RefVec(IN_SIZE); - auto cpu_start = std::chrono::high_resolution_clock::now(); - - for (uint32_t t = 0; t < IN_SIZE; t+=TILE_SIZE) { - float running = 0.0; - for (uint32_t i = 0; i < TILE_SIZE; i++) { - float ez = (float)(exp(AVec[t+i])); - running += ez; - RefVec[t+i] = exp(AVec[t+i]); - } - - for (uint32_t i = 0; i < TILE_SIZE; i++) { - RefVec[t+i] /= running; - } - } - auto cpu_stop = std::chrono::high_resolution_clock::now(); - float cpu_time = - std::chrono::duration_cast(cpu_stop - cpu_start) - .count(); - - cpu_time_total += cpu_time; - cpu_time_min = (cpu_time < cpu_time_min) ? cpu_time : cpu_time_min; - cpu_time_max = (cpu_time > cpu_time_max) ? cpu_time : cpu_time_max; - - - - for (uint32_t i = 0; i < IN_SIZE; i++) { - std::bfloat16_t ref = RefVec[i]; - if (!nearly_equal(*(bufOut + i), ref)) { - std::cout << "Error in " << i << " output " << *(bufOut + i) - << " != " << ref << " actual e^" << AVec[i] << " : " << exp(AVec[i]) << std::endl; - errors++; - sticky_errors++; - } else { - if (verbosity >= 2) - std::cout << "Correct " << i << " output " << *(bufOut + i) - << " == " << ref << std::endl; - } - } - - - - } else { - if (verbosity >= 1) - std::cout << "WARNING: vector-scalar results not verified." - << std::endl; - } - - float npu_time = - std::chrono::duration_cast(stop - start) - .count(); - - npu_time_total += npu_time; - npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min; - npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max; - - std::string profile = vm["profile"].as(); - if (profile.length()) { - std::ofstream of; - of.open(profile, std::ios::app); // Append - of << IN_SIZE << "," << TILE_SIZE << "," << npu_time << std::endl; - } - - if (VERIFY) { - if (!errors) { - std::cout << iter << ": pass! in " << npu_time << "us" << std::endl; - } else { - std::cout << iter << ": fail! " << errors << " errors in " << npu_time << "us" << std::endl; - } - } - } - - std::cout << "Avg NPU exec time: " << npu_time_total / num_iter << "us." - << std::endl; - std::cout << "Min NPU exec time: " << npu_time_min << "us." << std::endl; - std::cout << "Max NPU exec time: " << npu_time_max << "us." << std::endl; - - // Let's figure out how many cycles it takes a core to do a single e^x - // There are 4 cores, so the total number of e^x's it does is one quarter of the test size - - int per_core_calcs = IN_SIZE/4; - float avg_npu_time = npu_time_total / num_iter; - float avg_npu_clocks = avg_npu_time/1.0E-3; // Time is in uS, but the AIE is clocked in nS - float clocks_per_calc = avg_npu_clocks/per_core_calcs; - std::cout << "Clocks per calc " << clocks_per_calc << std::endl; - - - - - std::cout << "Avg CPU exec time: " << cpu_time_total / num_iter << "us." - << std::endl; - std::cout << "Min CPU exec time: " << cpu_time_min << "us." << std::endl; - std::cout << "Max CPU exec time: " << cpu_time_max << "us." << std::endl; - - - if (VERIFY) { - if (!sticky_errors) { - std::cout << std::endl << "PASS!" << std::endl << std::endl; - return 0; - } else { - std::cout << std::endl << "FAIL." << std::endl << std::endl; - return 1; - } - } - else { - std::cout << "Verification skipped, but I'm sure it worked. I trust in you" << std::endl; - } - return 0; -} diff --git a/programming_examples/basic/relu/CMakeLists.txt b/programming_examples/ml/relu/CMakeLists.txt similarity index 100% rename from programming_examples/basic/relu/CMakeLists.txt rename to programming_examples/ml/relu/CMakeLists.txt diff --git a/programming_examples/basic/relu/Makefile b/programming_examples/ml/relu/Makefile similarity index 56% rename from programming_examples/basic/relu/Makefile rename to programming_examples/ml/relu/Makefile index 87e836fbfb..2869ca2976 100644 --- a/programming_examples/basic/relu/Makefile +++ b/programming_examples/ml/relu/Makefile @@ -8,22 +8,22 @@ include ../../makefile-common -all: build/final.xclbin build/insts.txt +all: build/final.xclbin -targetname = testRelu +targetname = myReLU -build/bf16_relu.o: ../../../aie_kernels/relu.cc +build/relu.o: mkdir -p ${@D} - cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -I../../../../aie_runtime_lib/AIE2 -c ../$< -o ${@F} + cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -c ../../../../aie_kernels/aie2/relu.cc -o ${@F} build/aie.mlir: aie2.py mkdir -p ${@D} python3 $< > $@ -build/final.xclbin: build/aie.mlir build/bf16_relu.o +build/final.xclbin: build/aie.mlir build/relu.o mkdir -p ${@D} - cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \ - --aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%) + cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \ + --xclbin-name=${@F} --ipu-insts-name=insts.txt ${ parse_eventIR_vs.json - -clean_trace: - rm -rf tmpTrace trace.txt - -clean: clean_trace +clean: rm -rf build _build ${targetname}.exe - diff --git a/programming_examples/basic/relu/aie2.py b/programming_examples/ml/relu/aie2.py similarity index 97% rename from programming_examples/basic/relu/aie2.py rename to programming_examples/ml/relu/aie2.py index 8204706127..6f3fe40ee0 100644 --- a/programming_examples/basic/relu/aie2.py +++ b/programming_examples/ml/relu/aie2.py @@ -49,7 +49,7 @@ def device_body(): # AIE Core Function declarations - bf16_relu = external_func("bf16_relu", inputs=[memRef_ty, memRef_ty]) + relu = external_func("bf16_relu", inputs=[memRef_ty, memRef_ty]) # Tile declarations ShimTile = tile(0, 0) @@ -87,7 +87,7 @@ def device_body(): # Set up compute tiles for i in range(n_cores): # Compute tile i - @core(cores[i], "bf16_relu.o") + @core(cores[i], "relu.o") def core_body(): for _ in for_(0xFFFFFFFF): for _ in for_(tiles): @@ -98,7 +98,7 @@ def core_body(): ObjectFifoPort.Consume, 1 ) - call(bf16_relu, [elem_in_a, elem_out]) + call(relu, [elem_in_a, elem_out]) inA_fifos[inA_fifo_names[i]].release( ObjectFifoPort.Consume, 1 diff --git a/programming_examples/ml/relu/run.lit b/programming_examples/ml/relu/run.lit new file mode 100644 index 0000000000..16c48f2aeb --- /dev/null +++ b/programming_examples/ml/relu/run.lit @@ -0,0 +1,11 @@ +// (c) Copyright 2024 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai, chess +// +// RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/../../../aie_kernels/aie2/relu.cc -o relu.o +// RUN: %python %S/aie2.py | aie-opt -cse -canonicalize -o ./aie.mlir +// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir +// RUN: g++-13 %S/test.cpp -o test.exe -std=c++23 -Wall -I%S/../../../runtime_lib/test_lib %S/../../../runtime_lib/test_lib/test_utils.cpp %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem +// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s +// CHECK: PASS! diff --git a/programming_examples/ml/relu/test.cpp b/programming_examples/ml/relu/test.cpp new file mode 100644 index 0000000000..170d90d9fd --- /dev/null +++ b/programming_examples/ml/relu/test.cpp @@ -0,0 +1,246 @@ +//===- test.cpp -------------------------------------------000---*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2023, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xrt/xrt_bo.h" +#include "xrt/xrt_device.h" +#include "xrt/xrt_kernel.h" + +#include "test_utils.h" + +#ifndef DATATYPES_USING_DEFINED +#define DATATYPES_USING_DEFINED +// ------------------------------------------------------ +// Configure this to match your buffer data type +// ------------------------------------------------------ +using INOUT0_DATATYPE = std::bfloat16_t; +using INOUT1_DATATYPE = std::bfloat16_t; +#endif + +namespace po = boost::program_options; + +// ---------------------------------------------------------------------------- +// Verify results (specific to our design example) +// ---------------------------------------------------------------------------- +template +int verify(int size, std::vector A, std::vector B, int verbosity) { + int errors = 0; + for (uint32_t i = 0; i < size; i++) { + // If the input is nan, lets just say its good + if (isnan(A[i])) + continue; + + T ref = (T)0; + if (A[i] > (T)0) + ref = A[i]; + if (!test_utils::nearly_equal(ref, B[i])) { + std::cout << "Error in output " << B[i] << " != " << ref << " from " + << A[i] << std::endl; + errors++; + } else { + if (verbosity > 1) + std::cout << "Correct output " << B[i] << " == " << ref << std::endl; + } + } + return errors; +} + +int main(int argc, const char *argv[]) { + + // Program arguments parsing + po::options_description desc("Allowed options"); + po::variables_map vm; + test_utils::add_default_options(desc); + + test_utils::parse_options(argc, argv, desc, vm); + + int verbosity = vm["verbosity"].as(); + int do_verify = vm["verify"].as(); + int n_iterations = vm["iters"].as(); + int n_warmup_iterations = vm["warmup"].as(); + int trace_size = vm["trace_sz"].as(); + + int INOUT0_VOLUME = 65536; // Input + int INOUT1_VOLUME = INOUT0_VOLUME; // Output + + size_t INOUT0_SIZE = INOUT0_VOLUME * sizeof(INOUT0_DATATYPE); + size_t INOUT1_SIZE = INOUT1_VOLUME * sizeof(INOUT1_DATATYPE); + + size_t OUT_SIZE = INOUT1_SIZE + trace_size; + + srand(time(NULL)); + + // Load instruction sequence + std::vector instr_v = + test_utils::load_instr_sequence(vm["instr"].as()); + if (verbosity >= 1) + std::cout << "Sequence instr count: " << instr_v.size() << "\n"; + + // ------------------------------------------------------ + // Get device, load the xclbin & kernel and register them + // ------------------------------------------------------ + xrt::device device; + xrt::kernel kernel; + + test_utils::init_xrt_load_kernel(device, kernel, verbosity, + vm["xclbin"].as(), + vm["kernel"].as()); + + // ------------------------------------------------------ + // Initialize input/ output buffer sizes and sync them + // ------------------------------------------------------ + auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + auto bo_inout0 = + xrt::bo(device, INOUT0_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); + auto bo_inout1 = + xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + // Assumes trace will only be added to inout1 + + if (verbosity >= 1) + std::cout << "Writing data into buffer objects.\n"; + + // Initialize instruction buffer + void *bufInstr = bo_instr.map(); + memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); + + // Initialize Inout buffer 0 with ascending bfloat16 raw patterns + // All of them ... + INOUT0_DATATYPE *bufInOut0 = bo_inout0.map(); + std::vector AVec(INOUT0_VOLUME); + for (int i = 0; i < INOUT0_VOLUME; i++) { + uint16_t raw = (uint16_t)i; + AVec[i] = *(std::bfloat16_t *)(&raw); + } + memcpy(bufInOut0, AVec.data(), (AVec.size() * sizeof(INOUT0_DATATYPE))); + + // Initialize Inout buffer 1 with zeros + char *bufInOut1 = bo_inout1.map(); + memset(bufInOut1, 0, OUT_SIZE); // Zeroes out INOUT1_VOLUME + trace_size + + // Sync buffers to update input buffer values + bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_inout0.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_inout1.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + // ------------------------------------------------------ + // Initialize run configs + // ------------------------------------------------------ + unsigned num_iter = n_iterations + n_warmup_iterations; + float npu_time_total = 0; + float npu_time_min = 9999999; + float npu_time_max = 0; + + int errors = 0; + + // ------------------------------------------------------ + // Main run loop + // ------------------------------------------------------ + for (unsigned iter = 0; iter < num_iter; iter++) { + + if (verbosity >= 1) { + std::cout << "Running Kernel.\n"; + } + + // Run kernel + if (verbosity >= 1) + std::cout << "Running Kernel.\n"; + auto start = std::chrono::high_resolution_clock::now(); + auto run = kernel(bo_instr, instr_v.size(), bo_inout0, bo_inout1); + run.wait(); + auto stop = std::chrono::high_resolution_clock::now(); + bo_inout1.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + + if (iter < n_warmup_iterations) { + /* Warmup iterations do not count towards average runtime. */ + continue; + } + + // Copy output results and verify they are correct + std::vector BVec(INOUT1_VOLUME); + + memcpy(BVec.data(), bufInOut1, (BVec.size() * sizeof(INOUT1_DATATYPE))); + if (do_verify) { + if (verbosity >= 1) { + std::cout << "Verifying results ..." << std::endl; + } + auto vstart = std::chrono::system_clock::now(); + errors = verify(INOUT0_VOLUME, AVec, BVec, verbosity); + auto vstop = std::chrono::system_clock::now(); + float vtime = + std::chrono::duration_cast(vstop - vstart) + .count(); + if (verbosity >= 1) { + std::cout << "Verify time: " << vtime << "secs." << std::endl; + } + } else { + if (verbosity >= 1) + std::cout << "WARNING: results not verified." << std::endl; + } + + // Write trace values if trace_size > 0 + if (trace_size > 0) { + test_utils::write_out_trace(((char *)bufInOut1) + INOUT1_SIZE, trace_size, + vm["trace_file"].as()); + } + + // Accumulate run times + float npu_time = + std::chrono::duration_cast(stop - start) + .count(); + + npu_time_total += npu_time; + npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min; + npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max; + } + + // ------------------------------------------------------ + // Print verification and timing results + // ------------------------------------------------------ + + // TODO - Mac count to guide gflops + float macs = 0; + + std::cout << std::endl + << "Avg NPU time: " << npu_time_total / n_iterations << "us." + << std::endl; + if (macs > 0) + std::cout << "Avg NPU gflops: " + << macs / (1000 * npu_time_total / n_iterations) << std::endl; + + std::cout << std::endl + << "Min NPU time: " << npu_time_min << "us." << std::endl; + if (macs > 0) + std::cout << "Max NPU gflops: " << macs / (1000 * npu_time_min) + << std::endl; + + std::cout << std::endl + << "Max NPU time: " << npu_time_max << "us." << std::endl; + if (macs > 0) + std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_max) + << std::endl; + + if (!errors) { + std::cout << "\nPASS!\n\n"; + return 0; + } else { + std::cout << "\nError count: " << errors << "\n\n"; + std::cout << "\nFailed.\n\n"; + return 1; + } +} diff --git a/programming_examples/basic/vector_softmax/CMakeLists.txt b/programming_examples/ml/vector_softmax/CMakeLists.txt similarity index 100% rename from programming_examples/basic/vector_softmax/CMakeLists.txt rename to programming_examples/ml/vector_softmax/CMakeLists.txt diff --git a/programming_examples/basic/vector_softmax/Makefile b/programming_examples/ml/vector_softmax/Makefile similarity index 89% rename from programming_examples/basic/vector_softmax/Makefile rename to programming_examples/ml/vector_softmax/Makefile index eea6b707f5..4f27c07551 100755 --- a/programming_examples/basic/vector_softmax/Makefile +++ b/programming_examples/ml/vector_softmax/Makefile @@ -12,22 +12,22 @@ targetname = testExp all: build/final.xclbin build/insts.txt -build/vecexp.cc: bf16_softmax.mlir +build/dut.cc: bf16_softmax.mlir mkdir -p ${@D} - cd ${@D} && aie-opt ../$< -affine-super-vectorize="virtual-vector-size=16 test-fastest-varying=0 vectorize-reductions=true" --convert-vector-to-aievec="aie-target=aieml" -lower-affine | aie-translate -aieml=true --aievec-to-cpp -o vecexp.cc + cd ${@D} && aie-opt ../$< -affine-super-vectorize="virtual-vector-size=16 test-fastest-varying=0 vectorize-reductions=true" --convert-vector-to-aievec="aie-target=aieml" -lower-affine | aie-translate -aieml=true --aievec-to-cpp -o ${@F} -build/vecexp.o: build/vecexp.cc +build/dut.o: build/dut.cc cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I../../../../aie_runtime_lib/AIE2 -c $(<:%=../%) -o ${@F} build/lut_based_ops.o: mkdir -p ${@D} cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -c ../../../../aie_runtime_lib/AIE2/lut_based_ops.cpp -o ${@F} -build/exp.o: exp.cc +build/softmax.o: softmax.cc mkdir -p ${@D} cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -I../../../../aie_runtime_lib/AIE2 -c $(<:%=../%) -o ${@F} -build/kernels.a: build/exp.o build/lut_based_ops.o build/vecexp.o +build/kernels.a: build/softmax.o build/lut_based_ops.o build/dut.o ar rvs $@ $+ build/aie.mlir: aie2.py @@ -63,5 +63,5 @@ clean_trace: rm -rf tmpTrace trace.txt clean: clean_trace - rm -rf build _build ${targetname}.exe vecexp.cc + rm -rf build _build ${targetname}.exe diff --git a/programming_examples/basic/vector_softmax/README.md b/programming_examples/ml/vector_softmax/README.md similarity index 100% rename from programming_examples/basic/vector_softmax/README.md rename to programming_examples/ml/vector_softmax/README.md diff --git a/programming_examples/basic/vector_softmax/aie2.py b/programming_examples/ml/vector_softmax/aie2.py similarity index 95% rename from programming_examples/basic/vector_softmax/aie2.py rename to programming_examples/ml/vector_softmax/aie2.py index 05d4ed5be9..5672819f7a 100755 --- a/programming_examples/basic/vector_softmax/aie2.py +++ b/programming_examples/ml/vector_softmax/aie2.py @@ -46,8 +46,8 @@ def device_body(): # AIE Core Function declarations - exp_bf16_vector = external_func( - "exp_bf16_vector", inputs=[memRef_ty, memRef_ty] + softmax_bf16_vector = external_func( + "softmax_bf16_vector", inputs=[memRef_ty, memRef_ty] ) # Tile declarations @@ -93,7 +93,7 @@ def core_body(): ObjectFifoPort.Consume, 1 ) - call(exp_bf16_vector, [elem_in_a, elem_out]) + call(softmax_bf16_vector, [elem_in_a, elem_out]) inA_fifos[inA_fifo_names[i]].release( ObjectFifoPort.Consume, 1 diff --git a/programming_examples/basic/vector_softmax/bf16_softmax.mlir b/programming_examples/ml/vector_softmax/bf16_softmax.mlir similarity index 100% rename from programming_examples/basic/vector_softmax/bf16_softmax.mlir rename to programming_examples/ml/vector_softmax/bf16_softmax.mlir diff --git a/programming_examples/ml/vector_softmax/run.lit b/programming_examples/ml/vector_softmax/run.lit new file mode 100644 index 0000000000..54c7ccff98 --- /dev/null +++ b/programming_examples/ml/vector_softmax/run.lit @@ -0,0 +1,15 @@ +// (c) Copyright 2024 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai, chess +// +// RUN: aie-opt %S/bf16_softmax.mlir --affine-super-vectorize="virtual-vector-size=16 test-fastest-varying=0 vectorize-reductions=true" --convert-vector-to-aievec="aie-target=aieml" -lower-affine | aie-translate -aieml=true --aievec-to-cpp -o dut.cc +// RUN: xchesscc_wrapper aie2 -I %aietools/include -I%S/../../../aie_runtime_lib/AIE2 -c dut.cc -o dut.o +// RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/../../../aie_runtime_lib/AIE2/lut_based_ops.cpp -o lut_based_ops.o +// RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/softmax.cc -o softmax.o +// RUN: ar rvs kernels.a dut.o lut_based_ops.o softmax.o +// RUN: %python %S/aie2.py | aie-opt -cse -canonicalize -o ./aie.mlir +// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir +// RUN: g++-13 %S/test.cpp -o test.exe -std=c++23 -Wall -I%S/../../../runtime_lib/test_lib %S/../../../runtime_lib/test_lib/test_utils.cpp %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem +// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s +// CHECK: PASS! diff --git a/programming_examples/basic/vector_softmax/exp.cc b/programming_examples/ml/vector_softmax/softmax.cc similarity index 84% rename from programming_examples/basic/vector_softmax/exp.cc rename to programming_examples/ml/vector_softmax/softmax.cc index 5fd060c239..6c4f9e27e1 100755 --- a/programming_examples/basic/vector_softmax/exp.cc +++ b/programming_examples/ml/vector_softmax/softmax.cc @@ -19,10 +19,11 @@ #include +// Softmax DUT generated from vector dialect extern void dut(bfloat16 *a_in, bfloat16 *cout); extern "C" { -void exp_bf16_vector(bfloat16 *a_in, bfloat16 *c_out) { dut(a_in, c_out); } +void softmax_bf16_vector(bfloat16 *a_in, bfloat16 *c_out) { dut(a_in, c_out); } } // extern "C" diff --git a/programming_examples/ml/vector_softmax/test.cpp b/programming_examples/ml/vector_softmax/test.cpp new file mode 100644 index 0000000000..94767b139e --- /dev/null +++ b/programming_examples/ml/vector_softmax/test.cpp @@ -0,0 +1,256 @@ +//===- test.cpp -------------------------------------------000---*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2023, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xrt/xrt_bo.h" +#include "xrt/xrt_device.h" +#include "xrt/xrt_kernel.h" + +#include "test_utils.h" + +#ifndef DATATYPES_USING_DEFINED +#define DATATYPES_USING_DEFINED +using INOUT0_DATATYPE = std::bfloat16_t; +using INOUT1_DATATYPE = std::bfloat16_t; +#endif + +namespace po = boost::program_options; + +// ---------------------------------------------------------------------------- +// Verify results (specific to our design example) +// ---------------------------------------------------------------------------- +template +int verify(int size, int tile_size, std::vector A, std::vector B, + int verbosity) { + + int errors = 0; + std::vector RefVec(size); + + for (uint32_t t = 0; t < size; t += tile_size) { + float running = 0.0; + for (uint32_t i = 0; i < tile_size; i++) { + float ez = (float)(exp(A[t + i])); + running += ez; + RefVec[t + i] = exp(A[t + i]); + } + + for (uint32_t i = 0; i < tile_size; i++) { + RefVec[t + i] /= running; + } + } + + for (uint32_t i = 0; i < size; i++) { + + if (!test_utils::nearly_equal(RefVec[i], B[i], 0.03125)) { + std::cout << "Error in output " << B[i] << " != " << RefVec[i] + << std::endl; + errors++; + } else { + if (verbosity > 1) + std::cout << "Correct output " << B[i] << " == " << RefVec[i] + << std::endl; + } + } + return errors; +} + +int main(int argc, const char *argv[]) { + + // Program arguments parsing + po::options_description desc("Allowed options"); + po::variables_map vm; + test_utils::add_default_options(desc); + + test_utils::parse_options(argc, argv, desc, vm); + + int verbosity = vm["verbosity"].as(); + int do_verify = vm["verify"].as(); + int n_iterations = vm["iters"].as(); + int n_warmup_iterations = vm["warmup"].as(); + int trace_size = vm["trace_sz"].as(); + + int TILE_SIZE = 1024; + int INOUT0_VOLUME = 262144; // Input + int INOUT1_VOLUME = INOUT0_VOLUME; // Output + + size_t INOUT0_SIZE = INOUT0_VOLUME * sizeof(INOUT0_DATATYPE); + size_t INOUT1_SIZE = INOUT1_VOLUME * sizeof(INOUT1_DATATYPE); + + size_t OUT_SIZE = INOUT1_SIZE + trace_size; + + srand(time(NULL)); + + // Load instruction sequence + std::vector instr_v = + test_utils::load_instr_sequence(vm["instr"].as()); + if (verbosity >= 1) + std::cout << "Sequence instr count: " << instr_v.size() << "\n"; + + // ------------------------------------------------------ + // Get device, load the xclbin & kernel and register them + // ------------------------------------------------------ + xrt::device device; + xrt::kernel kernel; + + test_utils::init_xrt_load_kernel(device, kernel, verbosity, + vm["xclbin"].as(), + vm["kernel"].as()); + + // ------------------------------------------------------ + // Initialize input/ output buffer sizes and sync them + // ------------------------------------------------------ + auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + auto bo_inout0 = + xrt::bo(device, INOUT0_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); + auto bo_inout1 = + xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + // Assumes trace will only be added to inout1 + + if (verbosity >= 1) + std::cout << "Writing data into buffer objects.\n"; + + // Initialize instruction buffer + void *bufInstr = bo_instr.map(); + memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); + + // Initialize Inout buffer 0 with ascending bfloat16 raw patterns + // All of them ... + INOUT0_DATATYPE *bufInOut0 = bo_inout0.map(); + std::vector AVec(INOUT0_VOLUME); + for (int i = 0; i < INOUT0_VOLUME; i++) { + AVec[i] = test_utils::random_bfloat16_t((std::bfloat16_t)8.0, + (std::bfloat16_t)-4.0); + } + memcpy(bufInOut0, AVec.data(), (AVec.size() * sizeof(INOUT0_DATATYPE))); + + // Initialize Inout buffer 1 with zeros + char *bufInOut1 = bo_inout1.map(); + memset(bufInOut1, 0, OUT_SIZE); // Zeroes out INOUT1_VOLUME + trace_size + + // Sync buffers to update input buffer values + bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_inout0.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_inout1.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + // ------------------------------------------------------ + // Initialize run configs + // ------------------------------------------------------ + unsigned num_iter = n_iterations + n_warmup_iterations; + float npu_time_total = 0; + float npu_time_min = 9999999; + float npu_time_max = 0; + + int errors = 0; + + // ------------------------------------------------------ + // Main run loop + // ------------------------------------------------------ + for (unsigned iter = 0; iter < num_iter; iter++) { + + if (verbosity >= 1) { + std::cout << "Running Kernel.\n"; + } + + // Run kernel + if (verbosity >= 1) + std::cout << "Running Kernel.\n"; + auto start = std::chrono::high_resolution_clock::now(); + auto run = kernel(bo_instr, instr_v.size(), bo_inout0, bo_inout1); + run.wait(); + auto stop = std::chrono::high_resolution_clock::now(); + bo_inout1.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + + if (iter < n_warmup_iterations) { + /* Warmup iterations do not count towards average runtime. */ + continue; + } + + // Copy output results and verify they are correct + std::vector BVec(INOUT1_VOLUME); + + memcpy(BVec.data(), bufInOut1, (BVec.size() * sizeof(INOUT1_DATATYPE))); + if (do_verify) { + if (verbosity >= 1) { + std::cout << "Verifying results ..." << std::endl; + } + auto vstart = std::chrono::system_clock::now(); + errors = verify(INOUT0_VOLUME, TILE_SIZE, AVec, BVec, verbosity); + auto vstop = std::chrono::system_clock::now(); + float vtime = + std::chrono::duration_cast(vstop - vstart) + .count(); + if (verbosity >= 1) { + std::cout << "Verify time: " << vtime << "secs." << std::endl; + } + } else { + if (verbosity >= 1) + std::cout << "WARNING: results not verified." << std::endl; + } + + // Write trace values if trace_size > 0 + if (trace_size > 0) { + test_utils::write_out_trace(((char *)bufInOut1) + INOUT1_SIZE, trace_size, + vm["trace_file"].as()); + } + + // Accumulate run times + float npu_time = + std::chrono::duration_cast(stop - start) + .count(); + + npu_time_total += npu_time; + npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min; + npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max; + } + + // ------------------------------------------------------ + // Print verification and timing results + // ------------------------------------------------------ + + // TODO - Mac count to guide gflops + float macs = 0; + + std::cout << std::endl + << "Avg NPU time: " << npu_time_total / n_iterations << "us." + << std::endl; + if (macs > 0) + std::cout << "Avg NPU gflops: " + << macs / (1000 * npu_time_total / n_iterations) << std::endl; + + std::cout << std::endl + << "Min NPU time: " << npu_time_min << "us." << std::endl; + if (macs > 0) + std::cout << "Max NPU gflops: " << macs / (1000 * npu_time_min) + << std::endl; + + std::cout << std::endl + << "Max NPU time: " << npu_time_max << "us." << std::endl; + if (macs > 0) + std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_max) + << std::endl; + + if (!errors) { + std::cout << "\nPASS!\n\n"; + return 0; + } else { + std::cout << "\nError count: " << errors << "\n\n"; + std::cout << "\nFailed.\n\n"; + return 1; + } +}