diff --git a/programming_examples/basic/eltwise_add/add.cc b/programming_examples/basic/eltwise_add/add.cc deleted file mode 100755 index 75a0552ec8..0000000000 --- a/programming_examples/basic/eltwise_add/add.cc +++ /dev/null @@ -1,61 +0,0 @@ -//===- scale.cc -------------------------------------------------*- C++ -*-===// -// -// This file is licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// Copyright (C) 2023, Advanced Micro Devices, Inc. -// -//===----------------------------------------------------------------------===// - -#define __AIENGINE__ 2 -#define NOCPP -#define __AIEARCH__ 20 - -#include -#include -#include -#include - -#include - -template -void eltwise_add(T_in *a, T_in *b, T_out *c) { - for (int i = 0; i < N; i++) { - c[i] = a[i] + b[i]; - } -} - -template -void eltwise_vadd(T_in *a, T_in *b, T_out *c) { - - constexpr int vec_factor = 16; - event0(); - T_in *__restrict pA1 = a; - T_in *__restrict pB1 = b; - T_out *__restrict pC1 = c; - const int F = N / vec_factor; - for (int i = 0; i < F; i++) - chess_prepare_for_pipelining chess_loop_range(16, ) { - aie::vector A0 = aie::load_v(pA1); - pA1 += vec_factor; - aie::vector B0 = aie::load_v(pB1); - pB1 += vec_factor; - aie::vector cout = aie::add(A0, B0); - aie::store_v(pC1, cout); - pC1 += vec_factor; - } - event1(); -} - -extern "C" { - -void eltwise_add_bf16_scalar(bfloat16 *a_in, bfloat16 *b_in, bfloat16 *c_out) { - eltwise_add(a_in, b_in, c_out); -} - -void eltwise_add_bf16_vector(bfloat16 *a_in, bfloat16 *b_in, bfloat16 *c_out) { - eltwise_vadd(a_in, b_in, c_out); -} - -} // extern "C" diff --git a/programming_examples/basic/eltwise_mul/CMakeLists.txt b/programming_examples/basic/eltwise_mul/CMakeLists.txt new file mode 100644 index 0000000000..c64f84842b --- /dev/null +++ b/programming_examples/basic/eltwise_mul/CMakeLists.txt @@ -0,0 +1,69 @@ +# This file 
is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2023 Advanced Micro Devices, Inc. + +# parameters +# -DBOOST_ROOT: Path to Boost install +# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo +# -DXRT_LIB_DIR: Path to xrt_coreutil.lib +# -DTARGET_NAME: Target name to be built + +# cmake needs this line +cmake_minimum_required(VERSION 3.1) + +find_program(WSL NAMES powershell.exe) + +if (NOT WSL) + set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install") + set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo") + set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib") +else() + set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install") + set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo") + set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") +endif() + +set(TARGET_NAME test CACHE STRING "Target to be built") + +SET (ProjectName ${TARGET_NAME}) +SET (currentTarget ${TARGET_NAME}) + +if ( WSL ) + set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR}) +endif () + +project(${ProjectName}) + +# Find packages +find_package(Boost REQUIRED) + +add_executable(${currentTarget} + test.cpp +) + +target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1) + +target_include_directories (${currentTarget} PUBLIC + ${XRT_INC_DIR} + ${Boost_INCLUDE_DIRS} + ../../../programming_examples/utils +) + +target_link_directories(${currentTarget} PUBLIC + ${XRT_LIB_DIR} + ${Boost_LIBRARY_DIRS} +) + +if (NOT WSL) + target_link_libraries(${currentTarget} PUBLIC + xrt_coreutil + boost_program_options + boost_filesystem + ) +else() + target_link_libraries(${currentTarget} PUBLIC + xrt_coreutil + ) +endif() diff --git 
a/programming_examples/basic/eltwise_mul/Makefile b/programming_examples/basic/eltwise_mul/Makefile new file mode 100644 index 0000000000..363f12c4e4 --- /dev/null +++ b/programming_examples/basic/eltwise_mul/Makefile @@ -0,0 +1,47 @@ +##===- Makefile -----------------------------------------------------------===## +# +# This file licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## + +include ../../../programming_examples/basic/makefile-common + +all: build/final.xclbin + +targetname = myEltwiseMul + +build/mul.o: + mkdir -p ${@D} + cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -c ${REPO_ROOT}/aie_kernels/aie2/mul.cc -o ${@F} + +build/aie.mlir: aie2.py + mkdir -p ${@D} + python3 $< > $@ + +build/final.xclbin: build/aie.mlir build/mul.o + mkdir -p ${@D} + cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \ + --xclbin-name=${@F} --ipu-insts-name=insts.txt ${ + +# Section 3 - My First Program + +In this section, we'll put together what you learned in [section-1](../section-1) for defining a basic structural design in Python and combine it with the data movement part from [section-2](../section-2) to build our first program. We will then run a simulation on this program as well as run this design on hardware (Ryzen AI).
+ +* Introduce example of first simple program (Bias Add) + * Walk through syntax of aie2.py, test.cpp, test_utils.h, maybe CMakeLists.txt and Makefile/ makefile-common as well + * need to remove trace parts from test.cpp for now and move it to Section-4 + +* Illustrate how built-in simulation of single core design +* Illustrate how to run designs on Ryzen AI enabled hardware diff --git a/programming_examples/basic/eltwise_mul/aie2.py b/programming_examples/basic/eltwise_mul/aie2.py new file mode 100644 index 0000000000..c5f15a459d --- /dev/null +++ b/programming_examples/basic/eltwise_mul/aie2.py @@ -0,0 +1,151 @@ +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2023 AMD Inc. + +import sys + +from aie.dialects.aie import * +from aie.dialects.aiex import * +from aie.dialects.scf import * +from aie.extras.context import mlir_mod_ctx + + +def my_eltwise_mul(): + + word_size_in = 2 + N = 65536 + N_in_bytes = N * word_size_in + + A_sz_in_i32s = N_in_bytes // 4 + B_sz_in_i32s = N_in_bytes // 4 + C_sz_in_i32s = N_in_bytes // 4 + + # Tile sizes + n = 1024 + N_div_n = N // n + + n_cores = 2 + tiles = N_div_n // n_cores + buffer_depth = 2 + + with mlir_mod_ctx() as ctx: + + @device(AIEDevice.ipu) + def device_body(): + memRef_ty = T.memref(n, T.bf16()) + + # Type used in the tile memory + memRef_A_ty = T.memref(n, T.bf16()) + memRef_B_ty = T.memref(n, T.bf16()) + memRef_C_ty = T.memref(n, T.bf16()) + + # Type used in the memory tile which aggregates across the 4 cores + memRef_A_MT_ty = T.memref(n * n_cores, T.bf16()) + memRef_B_MT_ty = T.memref(n * n_cores, T.bf16()) + memRef_C_MT_ty = T.memref(n * n_cores, T.bf16()) + + # AIE Core Function declarations + + eltwise_mul_bf16_scalar = external_func( + "eltwise_mul_bf16_scalar", inputs=[memRef_ty, memRef_ty, memRef_ty] + ) + eltwise_mul_bf16_vector = 
external_func( + "eltwise_mul_bf16_vector", inputs=[memRef_ty, memRef_ty, memRef_ty] + ) + # elwise_int32 = external_func("scale_int32", inputs=[memRef_ty, memRef_ty]) + + # Tile declarations + ShimTile = tile(0, 0) + + MemTile = tile(0, 1) + cores = [tile(0, 2 + i) for i in range(n_cores)] + + inA_fifo_names = [f"memA{i}" for i in range(n_cores)] + inB_fifo_names = [f"memB{i}" for i in range(n_cores)] + outC_fifo_names = [f"memC{i}" for i in range(n_cores)] + + inA_fifos = {} + inB_fifos = {} + outC_fifos = {} + + # AIE-array data movement with object fifos + # Input A + inA = object_fifo("inA", ShimTile, MemTile, buffer_depth, memRef_A_MT_ty) + for i in range(n_cores): + inA_fifos[inA_fifo_names[i]] = object_fifo( + inA_fifo_names[i], MemTile, cores[i], buffer_depth, memRef_A_ty + ) + object_fifo_link(inA, inA_fifo_names) + + # Input B + inB = object_fifo("inB", ShimTile, MemTile, buffer_depth, memRef_B_MT_ty) + for i in range(n_cores): + inB_fifos[inB_fifo_names[i]] = object_fifo( + inB_fifo_names[i], MemTile, cores[i], buffer_depth, memRef_B_ty + ) + object_fifo_link(inB, inB_fifo_names[0:n_cores]) + + # Output C + for i in range(n_cores): + outC_fifos[outC_fifo_names[i]] = object_fifo( + outC_fifo_names[i], cores[i], MemTile, buffer_depth, memRef_C_ty + ) + outC = object_fifo("outC", MemTile, ShimTile, buffer_depth, memRef_C_MT_ty) + object_fifo_link(outC_fifo_names[0:n_cores], outC) + + # Set up compute tiles + for i in range(n_cores): + # Compute tile i + @core(cores[i], "mul.o") + def core_body(): + for _ in for_(0xFFFFFFFF): + for _ in for_(tiles): + elem_out = outC_fifos[outC_fifo_names[i]].acquire( + ObjectFifoPort.Produce, 1 + ) + elem_in_a = inA_fifos[inA_fifo_names[i]].acquire( + ObjectFifoPort.Consume, 1 + ) + elem_in_b = inB_fifos[inB_fifo_names[i]].acquire( + ObjectFifoPort.Consume, 1 + ) + + call( + eltwise_mul_bf16_vector, + [elem_in_a, elem_in_b, elem_out], + ) + inA_fifos[inA_fifo_names[i]].release( + ObjectFifoPort.Consume, 1 + ) + 
inB_fifos[inB_fifo_names[i]].release( + ObjectFifoPort.Consume, 1 + ) + outC_fifos[outC_fifo_names[i]].release( + ObjectFifoPort.Produce, 1 + ) + yield_([]) + yield_([]) + + # To/from AIE-array data movement + tensor_ty = T.memref(N, T.i32()) + + @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty) + def sequence(A, B, C): + ipu_dma_memcpy_nd( + metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, C_sz_in_i32s] + ) + ipu_dma_memcpy_nd( + metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s] + ) + ipu_dma_memcpy_nd( + metadata="inB", bd_id=2, mem=B, sizes=[1, 1, 1, B_sz_in_i32s] + ) + ipu_sync(column=0, row=0, direction=0, channel=0) + + print(ctx.module) + + +my_eltwise_mul() diff --git a/programming_examples/basic/eltwise_mul/test.cpp b/programming_examples/basic/eltwise_mul/test.cpp new file mode 100644 index 0000000000..c117c60c8f --- /dev/null +++ b/programming_examples/basic/eltwise_mul/test.cpp @@ -0,0 +1,297 @@ +//===- test.cpp -------------------------------------------000---*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2023, Advanced Micro Devices, Inc. 
+// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xrt/xrt_bo.h" +#include "xrt/xrt_device.h" +#include "xrt/xrt_kernel.h" + +#include "test_utils.h" + +#ifndef DATATYPES_USING_DEFINED +#define DATATYPES_USING_DEFINED +// ------------------------------------------------------ +// Configure this to match your buffer data type +// ------------------------------------------------------ +using INOUT0_DATATYPE = std::bfloat16_t; +using INOUT1_DATATYPE = std::bfloat16_t; +using INOUT2_DATATYPE = std::bfloat16_t; +#endif + +namespace po = boost::program_options; + +// ---------------------------------------------------------------------------- +// Verify results (specific to our design example) +// ---------------------------------------------------------------------------- +template +int verify(int size, std::vector A, std::vector B, std::vector C, + int verbosity) { + int errors = 0; + for (uint32_t i = 0; i < size; i++) { + T ref = A[i] * B[i]; + if (!test_utils::nearly_equal(ref, C[i], 0.00390625)) { + std::cout << "Error in output " << C[i] << " != " << ref << " from " + << A[i] << " * " << B[i] << std::endl; + errors++; + } else { + if (verbosity > 1) + std::cout << "Correct output " << C[i] << " == " << ref << std::endl; + } + } + return errors; +} + +// ---------------------------------------------------------------------------- +// Main +// ---------------------------------------------------------------------------- +int main(int argc, const char *argv[]) { + + // ------------------------------------------------------ + // Parse program arguments + // ------------------------------------------------------ + po::options_description desc("Allowed options"); + po::variables_map vm; + test_utils::add_default_options(desc); + + test_utils::parse_options(argc, argv, desc, vm); + int verbosity = vm["verbosity"].as(); + int do_verify = 
vm["verify"].as(); + int n_iterations = vm["iters"].as(); + int n_warmup_iterations = vm["warmup"].as(); + int trace_size = vm["trace_sz"].as(); + + // ------------------------------------------------------ + // Configure this to match your design's buffer size + // ------------------------------------------------------ + int INOUT0_VOLUME = 65536; // Input only, 64x uint32_t in this example + int INOUT1_VOLUME = INOUT0_VOLUME; // Not used in this example + int INOUT2_VOLUME = + INOUT0_VOLUME; // Output only, 64x uint32_t in this example + + size_t INOUT0_SIZE = INOUT0_VOLUME * sizeof(INOUT0_DATATYPE); + size_t INOUT1_SIZE = INOUT1_VOLUME * sizeof(INOUT1_DATATYPE); + size_t INOUT2_SIZE = INOUT2_VOLUME * sizeof(INOUT2_DATATYPE); + + // TODO Remove trace for now? + size_t OUT_SIZE = INOUT2_SIZE + trace_size; + + srand(time(NULL)); + + // Load instruction sequence + std::vector instr_v = + test_utils::load_instr_sequence(vm["instr"].as()); + if (verbosity >= 1) + std::cout << "Sequence instr count: " << instr_v.size() << "\n"; + + // ------------------------------------------------------ + // Get device, load the xclbin & kernel and register them + // ------------------------------------------------------ + // Get a device handle + unsigned int device_index = 0; + auto device = xrt::device(device_index); + + // Load the xclbin + if (verbosity >= 1) + std::cout << "Loading xclbin: " << vm["xclbin"].as() << "\n"; + auto xclbin = xrt::xclbin(vm["xclbin"].as()); + + // Load the kernel + if (verbosity >= 1) + std::cout << "Kernel opcode: " << vm["kernel"].as() << "\n"; + std::string Node = vm["kernel"].as(); + + // Get the kernel from the xclbin + auto xkernels = xclbin.get_kernels(); + auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(), + [Node, verbosity](xrt::xclbin::kernel &k) { + auto name = k.get_name(); + if (verbosity >= 1) { + std::cout << "Name: " << name << std::endl; + } + return name.rfind(Node, 0) == 0; + }); + auto kernelName = xkernel.get_name(); 
+ + // Register xclbin + if (verbosity >= 1) + std::cout << "Registering xclbin: " << vm["xclbin"].as() + << "\n"; + device.register_xclbin(xclbin); + + // Get a hardware context + if (verbosity >= 1) + std::cout << "Getting hardware context.\n"; + xrt::hw_context context(device, xclbin.get_uuid()); + + // Get a kernel handle + if (verbosity >= 1) + std::cout << "Getting handle to kernel:" << kernelName << "\n"; + auto kernel = xrt::kernel(context, kernelName); + + // ------------------------------------------------------ + // Initialize input/ output buffer sizes and sync them + // ------------------------------------------------------ + auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + auto bo_inout0 = + xrt::bo(device, INOUT0_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); + auto bo_inout1 = + xrt::bo(device, INOUT1_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + // Assumes trace will only be added to inout2 + auto bo_inout2 = + xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + + if (verbosity >= 1) + std::cout << "Writing data into buffer objects.\n"; + + // Initialize instruction buffer + void *bufInstr = bo_instr.map(); + memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); + + // Initialize Inout buffer 0 + INOUT0_DATATYPE *bufInOut0 = bo_inout0.map(); + std::vector AVec(INOUT0_VOLUME); + for (int i = 0; i < INOUT0_VOLUME; i++) + AVec[i] = test_utils::random_bfloat16_t((std::bfloat16_t)1.0, + (std::bfloat16_t)-0.5); + memcpy(bufInOut0, AVec.data(), (AVec.size() * sizeof(INOUT0_DATATYPE))); + + // Initialize Inout buffer 1 + INOUT1_DATATYPE *bufInOut1 = bo_inout1.map(); + std::vector BVec(INOUT1_VOLUME); + for (int i = 0; i < INOUT1_VOLUME; i++) + BVec[i] = test_utils::random_bfloat16_t((std::bfloat16_t)1.0, + (std::bfloat16_t)-0.5); + memcpy(bufInOut1, BVec.data(), (BVec.size() * sizeof(INOUT1_DATATYPE))); + + // Initialize Inout buffer 2 + char *bufInOut2 
= bo_inout2.map(); + std::vector CVec(INOUT2_VOLUME); + memset(bufInOut2, 0, OUT_SIZE); // Zeroes out INOUT2_VOLUME + trace_size + + // Sync buffers to update input buffer values + bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_inout0.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_inout1.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_inout2.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + // ------------------------------------------------------ + // Initialize run configs + // ------------------------------------------------------ + unsigned num_iter = n_iterations + n_warmup_iterations; + float npu_time_total = 0; + float npu_time_min = 9999999; + float npu_time_max = 0; + + int errors = 0; + + // ------------------------------------------------------ + // Main run loop + // ------------------------------------------------------ + for (unsigned iter = 0; iter < num_iter; iter++) { + + if (verbosity >= 1) { + std::cout << "Running Kernel.\n"; + } + + // Run kernel + if (verbosity >= 1) + std::cout << "Running Kernel.\n"; + auto start = std::chrono::high_resolution_clock::now(); + auto run = + kernel(bo_instr, instr_v.size(), bo_inout0, bo_inout1, bo_inout2); + run.wait(); + auto stop = std::chrono::high_resolution_clock::now(); + bo_inout2.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + + if (iter < n_warmup_iterations) { + /* Warmup iterations do not count towards average runtime. */ + continue; + } + + // Copy output results and verify they are correct + memcpy(CVec.data(), bufInOut2, (CVec.size() * sizeof(INOUT2_DATATYPE))); + if (do_verify) { + if (verbosity >= 1) { + std::cout << "Verifying results ..." << std::endl; + } + auto vstart = std::chrono::system_clock::now(); + errors = verify(INOUT0_VOLUME, AVec, BVec, CVec, verbosity); + auto vstop = std::chrono::system_clock::now(); + float vtime = + std::chrono::duration_cast(vstop - vstart) + .count(); + if (verbosity >= 1) { + std::cout << "Verify time: " << vtime << "secs." 
<< std::endl; + } + } else { + if (verbosity >= 1) + std::cout << "WARNING: results not verified." << std::endl; + } + + // Write trace values if trace_size > 0 + if (trace_size > 0) { + test_utils::write_out_trace(((char *)bufInOut2) + INOUT2_SIZE, trace_size, + vm["trace_file"].as()); + } + + // Accumulate run times + float npu_time = + std::chrono::duration_cast(stop - start) + .count(); + + npu_time_total += npu_time; + npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min; + npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max; + } + + // ------------------------------------------------------ + // Print verification and timing results + // ------------------------------------------------------ + + // TODO - Mac count to guide gflops + float macs = 0; + + std::cout << std::endl + << "Avg NPU time: " << npu_time_total / n_iterations << "us." + << std::endl; + if (macs > 0) + std::cout << "Avg NPU gflops: " + << macs / (1000 * npu_time_total / n_iterations) << std::endl; + + std::cout << std::endl + << "Min NPU time: " << npu_time_min << "us." << std::endl; + if (macs > 0) + std::cout << "Max NPU gflops: " << macs / (1000 * npu_time_min) + << std::endl; + + std::cout << std::endl + << "Max NPU time: " << npu_time_max << "us." << std::endl; + if (macs > 0) + std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_max) + << std::endl; + + if (!errors) { + std::cout << "\nPASS!\n\n"; + return 0; + } else { + std::cout << "\nError count: " << errors << "\n\n"; + std::cout << "\nFailed.\n\n"; + return 1; + } +} diff --git a/programming_examples/basic/eltwise_mul/test.py b/programming_examples/basic/eltwise_mul/test.py new file mode 100644 index 0000000000..cc132020d9 --- /dev/null +++ b/programming_examples/basic/eltwise_mul/test.py @@ -0,0 +1,157 @@ +# test.py -*- Python -*- +# +# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+# SPDX-License-Identifier: MIT + +# import argparse +import numpy as np +import pyxrt as xrt +import sys +import time + +sys.path.append("../../programming_examples/utils") +import test_utils + +# ------------------------------------------------------ +# Configure this to match your design's buffer size +# ------------------------------------------------------ +INOUT0_VOLUME = 64 # Input only, 64x uint32_t in this example +INOUT1_VOLUME = 64 # Not used in this example +INOUT2_VOLUME = 64 # Output only, 64x uint32_t in this example + +INOUT0_DATATYPE = np.uint32 +INOUT1_DATATYPE = np.uint32 +INOUT2_DATATYPE = np.uint32 + +INOUT0_SIZE = INOUT0_VOLUME * INOUT0_DATATYPE().itemsize +INOUT1_SIZE = INOUT1_VOLUME * INOUT1_DATATYPE().itemsize +INOUT2_SIZE = INOUT2_VOLUME * INOUT2_DATATYPE().itemsize + + +def main(opts): + + # Load instruction sequence + with open(opts.instr, "r") as f: + instr_text = f.read().split("\n") + instr_text = [l for l in instr_text if l != ""] + instr_v = np.array([int(i, 16) for i in instr_text], dtype=np.uint32) + + # ------------------------------------------------------ + # Get device, load the xclbin & kernel and register them + # ------------------------------------------------------ + + # Get a device handle + device = xrt.device(0) + + # Load the xclbin + xclbin = xrt.xclbin(opts.xclbin) + + # Load the kernel + kernels = xclbin.get_kernels() + try: + xkernel = [k for k in kernels if opts.kernel in k.get_name()][0] + except: + print(f"Kernel '{opts.kernel}' not found in '{opts.xclbin}'") + exit(-1) + + # Register xclbin + device.register_xclbin(xclbin) + + # Get a hardware context + context = xrt.hw_context(device, xclbin.get_uuid()) + + # get a kernel handle + kernel = xrt.kernel(context, xkernel.get_name()) + + # ------------------------------------------------------ + # Initialize input/ output buffer sizes and sync them + # ------------------------------------------------------ + bo_instr = xrt.bo(device, len(instr_v) * 4, 
xrt.bo.cacheable, kernel.group_id(0)) + bo_inout0 = xrt.bo(device, INOUT0_SIZE, xrt.bo.host_only, kernel.group_id(2)) + bo_inout1 = xrt.bo(device, INOUT1_SIZE, xrt.bo.host_only, kernel.group_id(3)) + bo_inout2 = xrt.bo(device, INOUT2_SIZE, xrt.bo.host_only, kernel.group_id(4)) + + # Initialize instruction buffer + bo_instr.write(instr_v, 0) + + # Initialize data buffers + inout0 = np.arange(1, INOUT0_VOLUME + 1, dtype=INOUT0_DATATYPE) + inout1 = np.zeros(INOUT1_VOLUME, dtype=INOUT1_DATATYPE) + inout2 = np.zeros(INOUT2_VOLUME, dtype=INOUT2_DATATYPE) + bo_inout0.write(inout0, 0) + bo_inout1.write(inout1, 0) + bo_inout2.write(inout2, 0) + + # Sync buffers to update input buffer values + bo_instr.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) + bo_inout0.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) + bo_inout1.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) + bo_inout2.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) + + # ------------------------------------------------------ + # Initialize run configs + # ------------------------------------------------------ + num_iter = opts.iters + opts.warmup_iters + npu_time_total = 0 + npu_time_min = 9999999 + npu_time_max = 0 + errors = 0 + + # ------------------------------------------------------ + # Main run loop + # ------------------------------------------------------ + for i in range(num_iter): + # Run kernel + if opts.verbosity >= 1: + print("Running Kernel.") + start = time.time_ns() + h = kernel(bo_instr, len(instr_v), bo_inout0, bo_inout1, bo_inout2) + h.wait() + stop = time.time_ns() + bo_inout2.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE) + + # Warmup iterations do not count towards average runtime. 
+ if i < opts.warmup_iters: + continue + + # Copy output results and verify they are correct + out_size = INOUT2_SIZE + opts.trace_size + output_buffer = bo_inout2.read(out_size, 0).view(INOUT2_DATATYPE) + if opts.verify: + if opts.verbosity >= 1: + print("Verifying results ...") + ref = np.arange(2, INOUT0_VOLUME + 2, dtype=INOUT0_DATATYPE) + e = np.equal(output_buffer, ref) + errors = errors + np.size(e) - np.count_nonzero(e) + + # Write trace values if trace_size > 0 + if opts.trace_size > 0: + print("Do something with trace!") + + npu_time = stop - start + npu_time_total = npu_time_total + npu_time + npu_time_min = min(npu_time_min, npu_time) + npu_time_max = max(npu_time_max, npu_time) + + # ------------------------------------------------------ + # Print verification and timing results + # ------------------------------------------------------ + + # TODO - Mac count to guide gflops + + print("\nAvg NPU time: {}us.".format(int((npu_time_total / opts.iters) / 1000))) + print("\nMin NPU time: {}us.".format(int((npu_time_min / opts.iters) / 1000))) + print("\nMax NPU time: {}us.".format(int((npu_time_max / opts.iters) / 1000))) + + if not errors: + print("\nPASS!\n") + exit(0) + else: + print("\nError count: ", errors) + print("\nFailed.\n") + exit(-1) + + +if __name__ == "__main__": + opts = test_utils.parse_args(sys.argv[1:]) + main(opts) diff --git a/programming_examples/basic/eltwise_add/CMakeLists.txt b/programming_examples/basic/relu/CMakeLists.txt similarity index 100% rename from programming_examples/basic/eltwise_add/CMakeLists.txt rename to programming_examples/basic/relu/CMakeLists.txt diff --git a/programming_examples/basic/eltwise_add/Makefile b/programming_examples/basic/relu/Makefile old mode 100755 new mode 100644 similarity index 77% rename from programming_examples/basic/eltwise_add/Makefile rename to programming_examples/basic/relu/Makefile index 98e5a70f1f..f862aad403 --- a/programming_examples/basic/eltwise_add/Makefile +++ 
b/programming_examples/basic/relu/Makefile @@ -8,19 +8,19 @@ include ../makefile-common -targetname = eltwiseAdd +targetname = testRelu all: build/final.xclbin build/insts.txt -build/%.o: %.cc +build/bf16_relu.o: ${REPO_ROOT}/aie_kernels/relu.cc mkdir -p ${@D} - cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -DBIT_WIDTH=8 -c $(<:%=../%) -o ${@F} + cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -I${REPO_ROOT}/my_install/mlir_aie/aie_runtime_lib/AIE2 -c $< -o ${@F} build/aie.mlir: aie2.py mkdir -p ${@D} python3 $< > $@ -build/final.xclbin: build/aie.mlir build/add.o +build/final.xclbin: build/aie.mlir build/bf16_relu.o mkdir -p ${@D} cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \ --aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%) @@ -39,6 +39,10 @@ endif run: ${targetname}.exe build/final.xclbin build/insts.txt ${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE +run_g: ${targetname}.exe build/final.xclbin build/insts.txt + ${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE --trace_sz 65536 + + trace: ../../utils/parse_eventIR.py --filename trace.txt --mlir build/aie.mlir --colshift 1 > parse_eventIR_vs.json diff --git a/programming_examples/basic/relu/aie2.py b/programming_examples/basic/relu/aie2.py new file mode 100644 index 0000000000..8204706127 --- /dev/null +++ b/programming_examples/basic/relu/aie2.py @@ -0,0 +1,209 @@ +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2023 AMD Inc. 
+ +import sys + +from aie.dialects.aie import * +from aie.dialects.aiex import * +from aie.dialects.scf import * +from aie.extras.context import mlir_mod_ctx + + +def my_relu(): + + word_size_in = 2 + N = 65536 + N_in_bytes = N * word_size_in + + A_sz_in_i32s = N_in_bytes // 4 + C_sz_in_i32s = N_in_bytes // 4 + + enable_tracing = True + trace_size = 65536 + + # Tile sizes + n = 1024 + N_div_n = N // n + + n_cores = 2 + tiles = N_div_n // n_cores + buffer_depth = 2 + + with mlir_mod_ctx() as ctx: + + @device(AIEDevice.ipu) + def device_body(): + memRef_ty = T.memref(n, T.bf16()) + + # Type used in the tile memory + memRef_A_ty = T.memref(n, T.bf16()) + memRef_C_ty = T.memref(n, T.bf16()) + + # Type used in the memory tile which aggregates across the 4 cores + memRef_A_MT_ty = T.memref(n * n_cores, T.bf16()) + memRef_C_MT_ty = T.memref(n * n_cores, T.bf16()) + + # AIE Core Function declarations + + bf16_relu = external_func("bf16_relu", inputs=[memRef_ty, memRef_ty]) + + # Tile declarations + ShimTile = tile(0, 0) + + MemTile = tile(0, 1) + cores = [tile(0, 2 + i) for i in range(n_cores)] + + inA_fifo_names = [f"memA{i}" for i in range(n_cores)] + outC_fifo_names = [f"memC{i}" for i in range(n_cores)] + + inA_fifos = {} + outC_fifos = {} + + # AIE-array data movement with object fifos + # Input A + inA = object_fifo("inA", ShimTile, MemTile, buffer_depth, memRef_A_MT_ty) + for i in range(n_cores): + inA_fifos[inA_fifo_names[i]] = object_fifo( + inA_fifo_names[i], MemTile, cores[i], buffer_depth, memRef_A_ty + ) + object_fifo_link(inA, inA_fifo_names) + + # Output C + for i in range(n_cores): + outC_fifos[outC_fifo_names[i]] = object_fifo( + outC_fifo_names[i], cores[i], MemTile, buffer_depth, memRef_C_ty + ) + outC = object_fifo("outC", MemTile, ShimTile, buffer_depth, memRef_C_MT_ty) + object_fifo_link(outC_fifo_names[0:n_cores], outC) + + # Set up a circuit-switched flow from core to shim for tracing information + if enable_tracing: + flow(cores[0], 
WireBundle.Trace, 0, ShimTile, WireBundle.DMA, 1) + + # Set up compute tiles + for i in range(n_cores): + # Compute tile i + @core(cores[i], "bf16_relu.o") + def core_body(): + for _ in for_(0xFFFFFFFF): + for _ in for_(tiles): + elem_out = outC_fifos[outC_fifo_names[i]].acquire( + ObjectFifoPort.Produce, 1 + ) + elem_in_a = inA_fifos[inA_fifo_names[i]].acquire( + ObjectFifoPort.Consume, 1 + ) + + call(bf16_relu, [elem_in_a, elem_out]) + + inA_fifos[inA_fifo_names[i]].release( + ObjectFifoPort.Consume, 1 + ) + outC_fifos[outC_fifo_names[i]].release( + ObjectFifoPort.Produce, 1 + ) + yield_([]) + yield_([]) + + # To/from AIE-array data movement + tensor_ty = T.memref(N, T.i32()) + + @FuncOp.from_py_func(tensor_ty, tensor_ty) + def sequence(A, C): + + # Configure tracing, see https://github.com/Xilinx/mlir-aie/blob/resnet/docs/Tracing.md + if enable_tracing: + # 0x340D0: Trace Control 0 + # 0xAABB---C + # AA <- Event to stop trace capture + # BB <- Event to start trace capture + # C <- Trace mode, 00=event=time, 01=event-PC, 10=execution + # Configure so that "Event 1" (always true) causes tracing to start + ipu_write32( + column=0, + row=2, + address=0x340D0, + value=0x00010000, + ) + # 0x340D4: Trace Control 1 + ipu_write32( + column=0, + row=2, + address=0x340D4, + value=0x00000000, + ) + # 0x340E0: Trace Event Group 1 (Which events to trace) + # 0xAABBCCDD AA, BB, CC, DD <- four event slots + ipu_write32( + column=0, + row=2, + address=0x340E0, + value=0x00222100, + ) + # 0x340E4: Trace Event Group 2 (Which events to trace) + # 0xAABBCCDD AA, BB, CC, DD <- four event slots + ipu_write32( + column=0, + row=2, + address=0x340E4, + value=0x00000000, + ) + + ipu_write32( + column=0, + row=2, + address=0x3FF00, + value=0x00000121, + ) + + # Configure a buffer descriptor to write tracing information that has been routed into this shim tile + # out to host DDR memory + trace_bd_id = 13 # use BD 13 for writing trace output from compute tile to DDR host memory + 
output_size = N_in_bytes + ipu_writebd_shimtile( + bd_id=trace_bd_id, + buffer_length=trace_size, + buffer_offset=output_size, + enable_packet=0, + out_of_order_id=0, + packet_id=0, + packet_type=0, + column=0, + column_num=1, + d0_size=0, + d0_stride=0, + d1_size=0, + d1_stride=0, + d2_stride=0, + ddr_id=1, + iteration_current=0, + iteration_size=0, + iteration_stride=0, + lock_acq_enable=0, + lock_acq_id=0, + lock_acq_val=0, + lock_rel_id=0, + lock_rel_val=0, + next_bd=0, + use_next_bd=0, + valid_bd=1, + ) + # Set start BD to our shim bd_Id (13) + ipu_write32(column=0, row=0, address=0x1D20C, value=trace_bd_id) + + ipu_dma_memcpy_nd( + metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, C_sz_in_i32s] + ) + ipu_dma_memcpy_nd( + metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s] + ) + ipu_sync(column=0, row=0, direction=0, channel=0) + + print(ctx.module) + + +my_relu() diff --git a/programming_examples/basic/eltwise_add/test.cpp b/programming_examples/basic/relu/test.cpp similarity index 65% rename from programming_examples/basic/eltwise_add/test.cpp rename to programming_examples/basic/relu/test.cpp index 67ca9d2e97..14bb24babe 100644 --- a/programming_examples/basic/eltwise_add/test.cpp +++ b/programming_examples/basic/relu/test.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -45,15 +46,24 @@ void check_arg_file_exists(po::variables_map &vm_in, std::string name) { } } -static inline std::bfloat16_t random_bfloat16_t() { +void write_out_trace(char *traceOutPtr, size_t trace_size, std::string path) { + std::ofstream fout(path); + uint32_t *traceOut = (uint32_t *)traceOutPtr; + for (int i = 0; i < trace_size / sizeof(traceOut[0]); i++) { + fout << std::setfill('0') << std::setw(8) << std::hex << (int)traceOut[i]; + fout << std::endl; + } +} + +static inline std::bfloat16_t random_bfloat16_t(float scale, float bias) { // Random numbers should NOT be uniformly between 0 and 1, because that // would make the matrix 
product AB always close to 1. - return std::bfloat16_t(4.0 * (float)rand() / (float)(RAND_MAX)); + return std::bfloat16_t((scale * (float)rand() / (float)(RAND_MAX)) - bias); } bool nearly_equal(std::bfloat16_t a, std::bfloat16_t b) { std::bfloat16_t diff = fabs(a - b); - if ((diff / a) < 0.01) + if ((diff / 4.0) < 0.001) return true; else return false; @@ -84,8 +94,12 @@ int main(int argc, const char *argv[]) { "the input xclbin path")( "kernel,k", po::value()->required(), "the kernel name in the XCLBIN (for instance PP_PRE_FD)")( - "verbosity,v", po::value()->default_value(0), - "the verbosity of the output")( + "trace_sz,t", po::value()->default_value(0), + "the depth of the trace buffer")( + "trace_file,f", po::value()->default_value("trace.txt"), + "the output trace path")("verbosity,v", + po::value()->default_value(0), + "the verbosity of the output")( "instr,i", po::value()->required(), "path of file containing userspace instructions to be sent to the LX6"); po::variables_map vm; @@ -114,6 +128,8 @@ int main(int argc, const char *argv[]) { if (verbosity >= 1) std::cout << "Sequence instr count: " << instr_v.size() << "\n"; + int trace_size = vm["trace_sz"].as(); + // Start the XRT test code // Get a device handle unsigned int device_index = 0; @@ -158,10 +174,10 @@ int main(int argc, const char *argv[]) { XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(std::bfloat16_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); - auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(std::bfloat16_t), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); - auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(std::bfloat16_t), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + + auto real_out_size = OUT_SIZE * sizeof(std::bfloat16_t) + trace_size; + auto bo_out = xrt::bo(device, real_out_size, XRT_BO_FLAGS_HOST_ONLY, + kernel.group_id(3)); if (verbosity >= 1) std::cout << "Writing data into buffer objects.\n"; @@ -169,25 +185,18 @@ int main(int argc, 
const char *argv[]) { std::bfloat16_t *bufA = bo_inA.map(); std::vector AVec(IN_SIZE); for (int i = 0; i < IN_SIZE; i++) - AVec[i] = random_bfloat16_t(); + AVec[i] = random_bfloat16_t(4.0, 2.0); memcpy(bufA, AVec.data(), (AVec.size() * sizeof(std::bfloat16_t))); - std::bfloat16_t *bufB = bo_inB.map(); - std::vector BVec(IN_SIZE); - for (int i = 0; i < IN_SIZE; i++) - BVec[i] = random_bfloat16_t(); - memcpy(bufB, BVec.data(), (BVec.size() * sizeof(std::bfloat16_t))); - void *bufInstr = bo_instr.map(); memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE); - bo_inB.sync(XCL_BO_SYNC_BO_TO_DEVICE); int sticky_errors = 0; - unsigned num_iter = 256; + unsigned num_iter = 2; float npu_time_total = 0; float npu_time_min = 9999999; float npu_time_max = 0; @@ -198,7 +207,7 @@ int main(int argc, const char *argv[]) { auto start = std::chrono::high_resolution_clock::now(); - auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); + auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_out); run.wait(); auto stop = std::chrono::high_resolution_clock::now(); @@ -213,11 +222,13 @@ int main(int argc, const char *argv[]) { std::cout << "Verifying results ..." << std::endl; } for (uint32_t i = 0; i < IN_SIZE; i++) { - std::bfloat16_t ref = AVec[i] + BVec[i]; + std::bfloat16_t ref = 0.0; + if (AVec[i] > 0.0) + ref = AVec[i]; if (!nearly_equal(*(bufOut + i), ref)) { std::cout << "Error in " << i << " output " << *(bufOut + i) - << " != " << ref << " actual " << AVec[i] << " + " - << BVec[i] << std::endl; + << " != " << ref << " actual max(" << AVec[i] << ", 0.0" + << std::endl; errors++; sticky_errors++; } else { @@ -240,10 +251,17 @@ int main(int argc, const char *argv[]) { npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min; npu_time_max = (npu_time > npu_time_max) ? 
npu_time : npu_time_max; - if (VERIFY && !errors) { - std::cout << iter << ": pass!\n"; - } else { - std::cout << iter << ": fail! " << errors << " errors\n"; + if (trace_size > 0) { + write_out_trace(((char *)bufOut) + (OUT_SIZE * 2), trace_size, + vm["trace_file"].as()); + } + + if (VERIFY) { + if (!errors) { + std::cout << iter << ": pass!\n"; + } else { + std::cout << iter << ": fail! " << errors << " errors\n"; + } } } @@ -252,11 +270,57 @@ int main(int argc, const char *argv[]) { std::cout << "Min NPU matmul time: " << npu_time_min << "us." << std::endl; std::cout << "Max NPU matmul time: " << npu_time_max << "us." << std::endl; - if (VERIFY && !sticky_errors) { - std::cout << "\nPASS!\n\n"; - return 0; + // Let's figure out how many cycles it takes a core to do a single e^x + // There are 4 cores, so the total number of e^x's it does is one quarter of + // the test size + + int per_core_calcs = IN_SIZE / 4; + float avg_npu_time = npu_time_total / num_iter; + float avg_npu_clocks = + avg_npu_time / 1.0E-3; // Time is in uS, but the AIE is clocked in nS + float clocks_per_calc = avg_npu_clocks / per_core_calcs; + std::cout << "Clocks per calc " << clocks_per_calc << std::endl; + + // Lets benchmark the CPU + float cpu_time_total = 0; + float cpu_time_min = 9999999; + float cpu_time_max = 0; + for (unsigned iter = 0; iter < num_iter; iter++) { + + std::vector AVec(IN_SIZE); + std::vector ResVec(IN_SIZE); + for (int i = 0; i < IN_SIZE; i++) { + AVec[i] = random_bfloat16_t(4.0, 2.0); + } + auto start = std::chrono::high_resolution_clock::now(); + for (int i = 0; i < IN_SIZE; i++) { + ResVec[i] = exp(AVec[i]); + } + auto stop = std::chrono::high_resolution_clock::now(); + float cpu_time = + std::chrono::duration_cast(stop - start) + .count(); + + cpu_time_total += cpu_time; + cpu_time_min = (cpu_time < cpu_time_min) ? cpu_time : cpu_time_min; + cpu_time_max = (cpu_time > cpu_time_max) ? 
cpu_time : cpu_time_max; + } + std::cout << "Avg CPU exec time: " << cpu_time_total / num_iter << "us." + << std::endl; + std::cout << "Min CPU matmul time: " << cpu_time_min << "us." << std::endl; + std::cout << "Max CPU matmul time: " << cpu_time_max << "us." << std::endl; + + if (VERIFY) { + if (!sticky_errors) { + std::cout << std::endl << "PASS!" << std::endl << std::endl; + return 0; + } else { + std::cout << std::endl << "FAIL." << std::endl << std::endl; + return 1; + } } else { - std::cout << "\nFAIL.\n\n"; - return 1; + std::cout << "Verification skipped, but I'm sure it worked. I trust in you" + << std::endl; } + return 0; } diff --git a/programming_examples/ml/eltwise_add/CMakeLists.txt b/programming_examples/ml/eltwise_add/CMakeLists.txt new file mode 100644 index 0000000000..c64f84842b --- /dev/null +++ b/programming_examples/ml/eltwise_add/CMakeLists.txt @@ -0,0 +1,69 @@ +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2023 Advanced Micro Devices, Inc. 
+ +# parameters +# -DBOOST_ROOT: Path to Boost install +# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo +# -DXRT_LIB_DIR: Path to xrt_coreutil.lib +# -DTARGET_NAME: Target name to be built + +# cmake needs this line +cmake_minimum_required(VERSION 3.1) + +find_program(WSL NAMES powershell.exe) + +if (NOT WSL) + set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install") + set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo") + set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib") +else() + set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install") + set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo") + set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") +endif() + +set(TARGET_NAME test CACHE STRING "Target to be built") + +SET (ProjectName ${TARGET_NAME}) +SET (currentTarget ${TARGET_NAME}) + +if ( WSL ) + set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR}) +endif () + +project(${ProjectName}) + +# Find packages +find_package(Boost REQUIRED) + +add_executable(${currentTarget} + test.cpp +) + +target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1) + +target_include_directories (${currentTarget} PUBLIC + ${XRT_INC_DIR} + ${Boost_INCLUDE_DIRS} + ../../../programming_examples/utils +) + +target_link_directories(${currentTarget} PUBLIC + ${XRT_LIB_DIR} + ${Boost_LIBRARY_DIRS} +) + +if (NOT WSL) + target_link_libraries(${currentTarget} PUBLIC + xrt_coreutil + boost_program_options + boost_filesystem + ) +else() + target_link_libraries(${currentTarget} PUBLIC + xrt_coreutil + ) +endif() diff --git a/programming_examples/ml/eltwise_add/Makefile b/programming_examples/ml/eltwise_add/Makefile new file mode 100644 index 0000000000..dd75274321 --- /dev/null +++ b/programming_examples/ml/eltwise_add/Makefile @@ -0,0 +1,47 @@ +##===- Makefile 
-----------------------------------------------------------===## +# +# This file licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## + +include ../../../programming_examples/basic/makefile-common + +all: build/final.xclbin + +targetname = myEltwiseAdd + +build/add.o: + mkdir -p ${@D} + cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -c ${REPO_ROOT}/aie_kernels/aie2/add.cc -o ${@F} + +build/aie.mlir: aie2.py + mkdir -p ${@D} + python3 $< > $@ + +build/final.xclbin: build/aie.mlir build/add.o + mkdir -p ${@D} + cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \ + --xclbin-name=${@F} --ipu-insts-name=insts.txt ${ + +# Section 3 - My First Program + +In this section, we'll put together what you learend in [section-1](../section-1) for defining a basic strucutral design in python and combine it with the data movement part from [section-2](../section-2) to build our first program. We will then run a simulation on this program as well as run this design on hardware (Ryzen AI). 
+ +* Introduce example of first simple program (Bias Add) + * Walk through syntax of aie2.py, test.cpp, test_utils.h, maybe CMakeLists.txt and Makefile/ makefile-common as well + * need to remove trace parts from test.cpp for now and move it to Section-4 + +* Illustrate how built-in simulation of single core design +* Illustrate how to run designs on Ryzen AI enabled hardware diff --git a/programming_examples/basic/eltwise_add/aie2.py b/programming_examples/ml/eltwise_add/aie2.py old mode 100755 new mode 100644 similarity index 100% rename from programming_examples/basic/eltwise_add/aie2.py rename to programming_examples/ml/eltwise_add/aie2.py diff --git a/programming_examples/ml/eltwise_add/test.cpp b/programming_examples/ml/eltwise_add/test.cpp new file mode 100644 index 0000000000..eb38eeb1de --- /dev/null +++ b/programming_examples/ml/eltwise_add/test.cpp @@ -0,0 +1,297 @@ +//===- test.cpp -------------------------------------------000---*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2023, Advanced Micro Devices, Inc. 
+// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xrt/xrt_bo.h" +#include "xrt/xrt_device.h" +#include "xrt/xrt_kernel.h" + +#include "test_utils.h" + +#ifndef DATATYPES_USING_DEFINED +#define DATATYPES_USING_DEFINED +// ------------------------------------------------------ +// Configure this to match your buffer data type +// ------------------------------------------------------ +using INOUT0_DATATYPE = std::bfloat16_t; +using INOUT1_DATATYPE = std::bfloat16_t; +using INOUT2_DATATYPE = std::bfloat16_t; +#endif + +namespace po = boost::program_options; + +// ---------------------------------------------------------------------------- +// Verify results (specific to our design example) +// ---------------------------------------------------------------------------- +template +int verify(int size, std::vector A, std::vector B, std::vector C, + int verbosity) { + int errors = 0; + for (uint32_t i = 0; i < size; i++) { + T ref = A[i] + B[i]; + if (!test_utils::nearly_equal(ref, C[i], 0.00390625)) { + std::cout << "Error in output " << C[i] << " != " << ref << " from " + << A[i] << " + " << B[i] << std::endl; + errors++; + } else { + if (verbosity > 1) + std::cout << "Correct output " << C[i] << " == " << ref << std::endl; + } + } + return errors; +} + +// ---------------------------------------------------------------------------- +// Main +// ---------------------------------------------------------------------------- +int main(int argc, const char *argv[]) { + + // ------------------------------------------------------ + // Parse program arguments + // ------------------------------------------------------ + po::options_description desc("Allowed options"); + po::variables_map vm; + test_utils::add_default_options(desc); + + test_utils::parse_options(argc, argv, desc, vm); + int verbosity = vm["verbosity"].as(); + int do_verify = 
vm["verify"].as(); + int n_iterations = vm["iters"].as(); + int n_warmup_iterations = vm["warmup"].as(); + int trace_size = vm["trace_sz"].as(); + + // ------------------------------------------------------ + // Configure this to match your design's buffer size + // ------------------------------------------------------ + int INOUT0_VOLUME = 65536; // Input only, 64x uint32_t in this example + int INOUT1_VOLUME = INOUT0_VOLUME; // Not used in this example + int INOUT2_VOLUME = + INOUT0_VOLUME; // Output only, 64x uint32_t in this example + + size_t INOUT0_SIZE = INOUT0_VOLUME * sizeof(INOUT0_DATATYPE); + size_t INOUT1_SIZE = INOUT1_VOLUME * sizeof(INOUT1_DATATYPE); + size_t INOUT2_SIZE = INOUT2_VOLUME * sizeof(INOUT2_DATATYPE); + + // TODO Remove trace for now? + size_t OUT_SIZE = INOUT2_SIZE + trace_size; + + srand(time(NULL)); + + // Load instruction sequence + std::vector instr_v = + test_utils::load_instr_sequence(vm["instr"].as()); + if (verbosity >= 1) + std::cout << "Sequence instr count: " << instr_v.size() << "\n"; + + // ------------------------------------------------------ + // Get device, load the xclbin & kernel and register them + // ------------------------------------------------------ + // Get a device handle + unsigned int device_index = 0; + auto device = xrt::device(device_index); + + // Load the xclbin + if (verbosity >= 1) + std::cout << "Loading xclbin: " << vm["xclbin"].as() << "\n"; + auto xclbin = xrt::xclbin(vm["xclbin"].as()); + + // Load the kernel + if (verbosity >= 1) + std::cout << "Kernel opcode: " << vm["kernel"].as() << "\n"; + std::string Node = vm["kernel"].as(); + + // Get the kernel from the xclbin + auto xkernels = xclbin.get_kernels(); + auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(), + [Node, verbosity](xrt::xclbin::kernel &k) { + auto name = k.get_name(); + if (verbosity >= 1) { + std::cout << "Name: " << name << std::endl; + } + return name.rfind(Node, 0) == 0; + }); + auto kernelName = xkernel.get_name(); 
+ + // Register xclbin + if (verbosity >= 1) + std::cout << "Registering xclbin: " << vm["xclbin"].as() + << "\n"; + device.register_xclbin(xclbin); + + // Get a hardware context + if (verbosity >= 1) + std::cout << "Getting hardware context.\n"; + xrt::hw_context context(device, xclbin.get_uuid()); + + // Get a kernel handle + if (verbosity >= 1) + std::cout << "Getting handle to kernel:" << kernelName << "\n"; + auto kernel = xrt::kernel(context, kernelName); + + // ------------------------------------------------------ + // Initialize input/ output buffer sizes and sync them + // ------------------------------------------------------ + auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + auto bo_inout0 = + xrt::bo(device, INOUT0_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); + auto bo_inout1 = + xrt::bo(device, INOUT1_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + // Assumes trace will only be added to inout2 + auto bo_inout2 = + xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + + if (verbosity >= 1) + std::cout << "Writing data into buffer objects.\n"; + + // Initialize instruction buffer + void *bufInstr = bo_instr.map(); + memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); + + // Initialize Inout buffer 0 + INOUT0_DATATYPE *bufInOut0 = bo_inout0.map(); + std::vector AVec(INOUT0_VOLUME); + for (int i = 0; i < INOUT0_VOLUME; i++) + AVec[i] = test_utils::random_bfloat16_t((std::bfloat16_t)1.0, + (std::bfloat16_t)-0.5); + memcpy(bufInOut0, AVec.data(), (AVec.size() * sizeof(INOUT0_DATATYPE))); + + // Initialize Inout buffer 1 + INOUT1_DATATYPE *bufInOut1 = bo_inout1.map(); + std::vector BVec(INOUT1_VOLUME); + for (int i = 0; i < INOUT1_VOLUME; i++) + BVec[i] = test_utils::random_bfloat16_t((std::bfloat16_t)1.0, + (std::bfloat16_t)-0.5); + memcpy(bufInOut1, BVec.data(), (BVec.size() * sizeof(INOUT1_DATATYPE))); + + // Initialize Inout buffer 2 + char *bufInOut2 
= bo_inout2.map(); + std::vector CVec(INOUT2_VOLUME); + memset(bufInOut2, 0, OUT_SIZE); // Zeroes out INOUT2_VOLUME + trace_size + + // Sync buffers to update input buffer values + bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_inout0.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_inout1.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_inout2.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + // ------------------------------------------------------ + // Initialize run configs + // ------------------------------------------------------ + unsigned num_iter = n_iterations + n_warmup_iterations; + float npu_time_total = 0; + float npu_time_min = 9999999; + float npu_time_max = 0; + + int errors = 0; + + // ------------------------------------------------------ + // Main run loop + // ------------------------------------------------------ + for (unsigned iter = 0; iter < num_iter; iter++) { + + if (verbosity >= 1) { + std::cout << "Running Kernel.\n"; + } + + // Run kernel + if (verbosity >= 1) + std::cout << "Running Kernel.\n"; + auto start = std::chrono::high_resolution_clock::now(); + auto run = + kernel(bo_instr, instr_v.size(), bo_inout0, bo_inout1, bo_inout2); + run.wait(); + auto stop = std::chrono::high_resolution_clock::now(); + bo_inout2.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + + if (iter < n_warmup_iterations) { + /* Warmup iterations do not count towards average runtime. */ + continue; + } + + // Copy output results and verify they are correct + memcpy(CVec.data(), bufInOut2, (CVec.size() * sizeof(INOUT2_DATATYPE))); + if (do_verify) { + if (verbosity >= 1) { + std::cout << "Verifying results ..." << std::endl; + } + auto vstart = std::chrono::system_clock::now(); + errors = verify(INOUT0_VOLUME, AVec, BVec, CVec, verbosity); + auto vstop = std::chrono::system_clock::now(); + float vtime = + std::chrono::duration_cast(vstop - vstart) + .count(); + if (verbosity >= 1) { + std::cout << "Verify time: " << vtime << "secs." 
<< std::endl; + } + } else { + if (verbosity >= 1) + std::cout << "WARNING: results not verified." << std::endl; + } + + // Write trace values if trace_size > 0 + if (trace_size > 0) { + test_utils::write_out_trace(((char *)bufInOut2) + INOUT2_SIZE, trace_size, + vm["trace_file"].as()); + } + + // Accumulate run times + float npu_time = + std::chrono::duration_cast(stop - start) + .count(); + + npu_time_total += npu_time; + npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min; + npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max; + } + + // ------------------------------------------------------ + // Print verification and timing results + // ------------------------------------------------------ + + // TODO - Mac count to guide gflops + float macs = 0; + + std::cout << std::endl + << "Avg NPU time: " << npu_time_total / n_iterations << "us." + << std::endl; + if (macs > 0) + std::cout << "Avg NPU gflops: " + << macs / (1000 * npu_time_total / n_iterations) << std::endl; + + std::cout << std::endl + << "Min NPU time: " << npu_time_min << "us." << std::endl; + if (macs > 0) + std::cout << "Max NPU gflops: " << macs / (1000 * npu_time_min) + << std::endl; + + std::cout << std::endl + << "Max NPU time: " << npu_time_max << "us." << std::endl; + if (macs > 0) + std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_max) + << std::endl; + + if (!errors) { + std::cout << "\nPASS!\n\n"; + return 0; + } else { + std::cout << "\nError count: " << errors << "\n\n"; + std::cout << "\nFailed.\n\n"; + return 1; + } +} diff --git a/programming_examples/ml/eltwise_add/test.py b/programming_examples/ml/eltwise_add/test.py new file mode 100644 index 0000000000..cc132020d9 --- /dev/null +++ b/programming_examples/ml/eltwise_add/test.py @@ -0,0 +1,157 @@ +# test.py -*- Python -*- +# +# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+# SPDX-License-Identifier: MIT + +# import argparse +import numpy as np +import pyxrt as xrt +import sys +import time + +sys.path.append("../../programming_examples/utils") +import test_utils + +# ------------------------------------------------------ +# Configure this to match your design's buffer size +# ------------------------------------------------------ +INOUT0_VOLUME = 64 # Input only, 64x uint32_t in this example +INOUT1_VOLUME = 64 # Not used in this example +INOUT2_VOLUME = 64 # Output only, 64x uint32_t in this example + +INOUT0_DATATYPE = np.uint32 +INOUT1_DATATYPE = np.uint32 +INOUT2_DATATYPE = np.uint32 + +INOUT0_SIZE = INOUT0_VOLUME * INOUT0_DATATYPE().itemsize +INOUT1_SIZE = INOUT1_VOLUME * INOUT1_DATATYPE().itemsize +INOUT2_SIZE = INOUT2_VOLUME * INOUT2_DATATYPE().itemsize + + +def main(opts): + + # Load instruction sequence + with open(opts.instr, "r") as f: + instr_text = f.read().split("\n") + instr_text = [l for l in instr_text if l != ""] + instr_v = np.array([int(i, 16) for i in instr_text], dtype=np.uint32) + + # ------------------------------------------------------ + # Get device, load the xclbin & kernel and register them + # ------------------------------------------------------ + + # Get a device handle + device = xrt.device(0) + + # Load the xclbin + xclbin = xrt.xclbin(opts.xclbin) + + # Load the kernel + kernels = xclbin.get_kernels() + try: + xkernel = [k for k in kernels if opts.kernel in k.get_name()][0] + except: + print(f"Kernel '{opts.kernel}' not found in '{opts.xclbin}'") + exit(-1) + + # Register xclbin + device.register_xclbin(xclbin) + + # Get a hardware context + context = xrt.hw_context(device, xclbin.get_uuid()) + + # get a kernel handle + kernel = xrt.kernel(context, xkernel.get_name()) + + # ------------------------------------------------------ + # Initialize input/ output buffer sizes and sync them + # ------------------------------------------------------ + bo_instr = xrt.bo(device, len(instr_v) * 4, 
xrt.bo.cacheable, kernel.group_id(0)) + bo_inout0 = xrt.bo(device, INOUT0_SIZE, xrt.bo.host_only, kernel.group_id(2)) + bo_inout1 = xrt.bo(device, INOUT1_SIZE, xrt.bo.host_only, kernel.group_id(3)) + bo_inout2 = xrt.bo(device, INOUT2_SIZE, xrt.bo.host_only, kernel.group_id(4)) + + # Initialize instruction buffer + bo_instr.write(instr_v, 0) + + # Initialize data buffers + inout0 = np.arange(1, INOUT0_VOLUME + 1, dtype=INOUT0_DATATYPE) + inout1 = np.zeros(INOUT1_VOLUME, dtype=INOUT1_DATATYPE) + inout2 = np.zeros(INOUT2_VOLUME, dtype=INOUT2_DATATYPE) + bo_inout0.write(inout0, 0) + bo_inout1.write(inout1, 0) + bo_inout2.write(inout2, 0) + + # Sync buffers to update input buffer values + bo_instr.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) + bo_inout0.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) + bo_inout1.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) + bo_inout2.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) + + # ------------------------------------------------------ + # Initialize run configs + # ------------------------------------------------------ + num_iter = opts.iters + opts.warmup_iters + npu_time_total = 0 + npu_time_min = 9999999 + npu_time_max = 0 + errors = 0 + + # ------------------------------------------------------ + # Main run loop + # ------------------------------------------------------ + for i in range(num_iter): + # Run kernel + if opts.verbosity >= 1: + print("Running Kernel.") + start = time.time_ns() + h = kernel(bo_instr, len(instr_v), bo_inout0, bo_inout1, bo_inout2) + h.wait() + stop = time.time_ns() + bo_inout2.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE) + + # Warmup iterations do not count towards average runtime. 
+ if i < opts.warmup_iters: + continue + + # Copy output results and verify they are correct + out_size = INOUT2_SIZE + opts.trace_size + output_buffer = bo_inout2.read(out_size, 0).view(INOUT2_DATATYPE) + if opts.verify: + if opts.verbosity >= 1: + print("Verifying results ...") + ref = np.arange(2, INOUT0_VOLUME + 2, dtype=INOUT0_DATATYPE) + e = np.equal(output_buffer, ref) + errors = errors + np.size(e) - np.count_nonzero(e) + + # Write trace values if trace_size > 0 + if opts.trace_size > 0: + print("Do something with trace!") + + npu_time = stop - start + npu_time_total = npu_time_total + npu_time + npu_time_min = min(npu_time_min, npu_time) + npu_time_max = max(npu_time_max, npu_time) + + # ------------------------------------------------------ + # Print verification and timing results + # ------------------------------------------------------ + + # TODO - Mac count to guide gflops + + print("\nAvg NPU time: {}us.".format(int((npu_time_total / opts.iters) / 1000))) + print("\nMin NPU time: {}us.".format(int((npu_time_min / opts.iters) / 1000))) + print("\nMax NPU time: {}us.".format(int((npu_time_max / opts.iters) / 1000))) + + if not errors: + print("\nPASS!\n") + exit(0) + else: + print("\nError count: ", errors) + print("\nFailed.\n") + exit(-1) + + +if __name__ == "__main__": + opts = test_utils.parse_args(sys.argv[1:]) + main(opts)