From cd2345a95a3589341b29ef5d4dd239762ca692b4 Mon Sep 17 00:00:00 2001 From: Kristof Denolf Date: Wed, 10 Apr 2024 14:53:28 -0600 Subject: [PATCH] basic passthrough_kernel --- .../aie_generic}/passThrough.cc | 13 +- .../basic/passthrough_kernel/CMakeLists.txt | 75 ++++++++ .../basic/passthrough_kernel/Makefile | 49 +++++ .../basic/passthrough_kernel/aie2.py | 169 ++++++++++++++++++ .../basic/passthrough_kernel/test.cpp | 130 ++++++++++++++ 5 files changed, 424 insertions(+), 12 deletions(-) rename {programming_examples/vision/vision_kernels => aie_kernels/aie_generic}/passThrough.cc (86%) create mode 100644 programming_examples/basic/passthrough_kernel/CMakeLists.txt create mode 100644 programming_examples/basic/passthrough_kernel/Makefile create mode 100644 programming_examples/basic/passthrough_kernel/aie2.py create mode 100644 programming_examples/basic/passthrough_kernel/test.cpp diff --git a/programming_examples/vision/vision_kernels/passThrough.cc b/aie_kernels/aie_generic/passThrough.cc similarity index 86% rename from programming_examples/vision/vision_kernels/passThrough.cc rename to aie_kernels/aie_generic/passThrough.cc index 0928af33f0..0fa7123ca9 100644 --- a/programming_examples/vision/vision_kernels/passThrough.cc +++ b/aie_kernels/aie_generic/passThrough.cc @@ -15,29 +15,18 @@ #include #include -#define REL_WRITE 0 -#define REL_READ 1 - #include template -__attribute__((noinline)) void passThrough_aie(T *restrict in, T *restrict out, +__attribute__((noinline)) void passThrough_aie(T *restrict in, T *restrict out, const int32_t height, const int32_t width) { - //::aie::vector data_out; - //::aie::mask temp_val; v64uint8 *restrict outPtr = (v64uint8 *)out; v64uint8 *restrict inPtr = (v64uint8 *)in; for (int j = 0; j < (height * width); j += N) // Nx samples per loop chess_prepare_for_pipelining chess_loop_range(6, ) { - //::aie::vector tmpVector = ::aie::load_v(in); - //::aie::store_v(out, tmpVector); - *outPtr++ = *inPtr++; - - // in += N; - // out 
+= N; } } diff --git a/programming_examples/basic/passthrough_kernel/CMakeLists.txt b/programming_examples/basic/passthrough_kernel/CMakeLists.txt new file mode 100644 index 0000000000..483fc84fb9 --- /dev/null +++ b/programming_examples/basic/passthrough_kernel/CMakeLists.txt @@ -0,0 +1,75 @@ +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2023 Xilinx Inc. + +# parameters +# -DBOOST_ROOT: Path to Boost install +# -DOpenCV_DIR: Path to OpenCV install +# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo +# -DXRT_LIB_DIR: Path to xrt_coreutil.lib +# -DTARGET_NAME: Target name to be built + +# cmake needs this line +cmake_minimum_required(VERSION 3.1) + +find_program(WSL NAMES powershell.exe) + +if (NOT WSL) + set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install") + set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo") + set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib") +else() + set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install") + set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo") + set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") +endif () + +set(PASSTHROUGH_SIZE 4096 CACHE STRING "size") +set(TARGET_NAME test CACHE STRING "Target to be built") + +SET (ProjectName ${TARGET_NAME}) +SET (currentTarget ${TARGET_NAME}) + +if ( WSL ) + set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR}) +endif () + +project(${ProjectName}) + +# Find packages +find_package(Boost REQUIRED) + +add_executable(${currentTarget} + ${CMAKE_CURRENT_SOURCE_DIR}/../../utils/xrtUtils.cpp + test.cpp +) + +target_compile_definitions(${currentTarget} PUBLIC + PASSTHROUGH_SIZE=${PASSTHROUGH_SIZE} + 
DISABLE_ABI_CHECK=1 + ) + +target_include_directories (${currentTarget} PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/../../utils + ${XRT_INC_DIR} + ${Boost_INCLUDE_DIRS} +) + +target_link_directories(${currentTarget} PUBLIC + ${XRT_LIB_DIR} + ${Boost_LIBRARY_DIRS} +) + +if (NOT WSL) + target_link_libraries(${currentTarget} PUBLIC + xrt_coreutil + boost_program_options + boost_filesystem + ) +else() + target_link_libraries(${currentTarget} PUBLIC + xrt_coreutil + ) +endif() diff --git a/programming_examples/basic/passthrough_kernel/Makefile b/programming_examples/basic/passthrough_kernel/Makefile new file mode 100644 index 0000000000..9fea098d84 --- /dev/null +++ b/programming_examples/basic/passthrough_kernel/Makefile @@ -0,0 +1,49 @@ +##===- Makefile -----------------------------------------------------------===## +# +# This file licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## + +include ../makefile-common + +VPATH := ../../../aie_kernels/aie_generic + +PASSTHROUGH_SIZE = 4096 + +targetname = passthrough_kernel + +.PHONY: all template clean + +all: build/final_${PASSTHROUGH_SIZE}.xclbin + +build/aie2_lineBased_8b_${PASSTHROUGH_SIZE}.mlir: aie2.py + mkdir -p ${@D} + python3 $< ${PASSTHROUGH_SIZE} > $@ + +build/passThrough.cc.o: passThrough.cc + mkdir -p ${@D} + cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -DBIT_WIDTH=8 -c $(<:%=../%) -o ${@F} + +build/final_${PASSTHROUGH_SIZE}.xclbin: build/aie2_lineBased_8b_${PASSTHROUGH_SIZE}.mlir build/passThrough.cc.o + mkdir -p ${@D} + cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \ + --xclbin-name=${@F} --ipu-insts-name=insts.txt $(<:%=../%) + +${targetname}.exe: test.cpp + rm -rf _build + mkdir -p _build + cd _build && ${powershell} cmake .. 
-DTARGET_NAME=${targetname} -DPASSTHROUGH_SIZE=${PASSTHROUGH_SIZE} + cd _build && ${powershell} cmake --build . --config Release +ifeq "${powershell}" "powershell.exe" + cp _build/${targetname}.exe $@ +else + cp _build/${targetname} $@ +endif + +run: ${targetname}.exe build/final_${PASSTHROUGH_SIZE}.xclbin build/insts.txt + ${powershell} ./$< -x build/final_${PASSTHROUGH_SIZE}.xclbin -i build/insts.txt -k MLIR_AIE + +clean: + rm -rf build _build ${targetname}.exe diff --git a/programming_examples/basic/passthrough_kernel/aie2.py b/programming_examples/basic/passthrough_kernel/aie2.py new file mode 100644 index 0000000000..cb5877c4bc --- /dev/null +++ b/programming_examples/basic/passthrough_kernel/aie2.py @@ -0,0 +1,169 @@ +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2023 AMD Inc. + +import sys + +from aie.dialects.aie import * +from aie.dialects.aiex import * +from aie.dialects.scf import * +from aie.extras.context import mlir_mod_ctx + +width = 1024 + +if len(sys.argv) == 2: + width = int(sys.argv[1]) + +lineWidthInBytes = width +lineWidthInInt32s = lineWidthInBytes // 4 + +enableTrace = False +traceSizeInBytes = 8192 +traceSizeInInt32s = traceSizeInBytes // 4 + + +def passThroughAIE2(): + with mlir_mod_ctx() as ctx: + + @device(AIEDevice.ipu) + def device_body(): + # define types + line_ty = T.memref(lineWidthInBytes, T.ui8()) + + # AIE Core Function declarations + passThroughLine = external_func( + "passThroughLine", inputs=[line_ty, line_ty, T.i32()] + ) + + # Tile declarations + ShimTile = tile(0, 0) + ComputeTile2 = tile(0, 2) + + if enableTrace: + flow(ComputeTile2, "Trace", 0, ShimTile, "DMA", 1) + + # AIE-array data movement with object fifos + of_in = object_fifo("in", ShimTile, ComputeTile2, 2, line_ty) + of_out = object_fifo("out", ComputeTile2, ShimTile, 2, line_ty) + + # Set up 
compute tiles + + # Compute tile 2 + @core(ComputeTile2, "passThrough.cc.o") + def core_body(): + for _ in for_(sys.maxsize): + elemOut = of_out.acquire(ObjectFifoPort.Produce, 1) + elemIn = of_in.acquire(ObjectFifoPort.Consume, 1) + call(passThroughLine, [elemIn, elemOut, width]) + of_in.release(ObjectFifoPort.Consume, 1) + of_out.release(ObjectFifoPort.Produce, 1) + yield_([]) + + # print(ctx.module.operation.verify()) + + tensorSize = width + tensorSizeInInt32s = tensorSize // 4 + tensor_ty = T.memref(tensorSizeInInt32s, T.i32()) + + @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty) + def sequence(inTensor, outTensor, notUsed): + if enableTrace: + # Trace output + + # Trace_Event0, Trace_Event1: Select which events to trace. + # Note that the event buffers only appear to be transferred to DDR in + # bursts of 256 bytes. If less than 256 bytes are written, you may not + # see trace output, or only see it on the next iteration of your + # kernel invocation, as the buffer gets filled up. Note that, even + # though events are encoded as 4 byte words, it may take more than 64 + # events to fill the buffer to 256 bytes and cause a flush, since + # multiple repeating events can be 'compressed' by the trace mechanism. + # In order to always generate sufficient events, we add the "assert + # TRUE" event to one slot, which fires every cycle, and thus fills our + # buffer quickly. 
+ + # Some events: + # TRUE (0x01) + # STREAM_STALL (0x18) + # LOCK_STALL (0x1A) + # EVENTS_CORE_INSTR_EVENT_1 (0x22) + # EVENTS_CORE_INSTR_EVENT_0 (0x21) + # INSTR_VECTOR (0x25) Core executes a vector MAC, ADD or compare instruction + # INSTR_LOCK_ACQUIRE_REQ (0x2C) Core executes a lock acquire instruction + # INSTR_LOCK_RELEASE_REQ (0x2D) Core executes a lock release instruction + # EVENTS_CORE_PORT_RUNNING_1 (0x4F) + # EVENTS_CORE_PORT_RUNNING_0 (0x4B) + + # Trace_Event0 (4 slots) + IpuWrite32(0, 2, 0x340E0, 0x4B222125) + # Trace_Event1 (4 slots) + IpuWrite32(0, 2, 0x340E4, 0x2D2C1A4F) + + # Event slots as configured above: + # 0: Kernel executes vector instruction + # 1: Event 0 -- Kernel starts + # 2: Event 1 -- Kernel done + # 3: Port_Running_0 + # 4: Port_Running_1 + # 5: Lock Stall + # 6: Lock Acquire Instr + # 7: Lock Release Instr + + # Stream_Switch_Event_Port_Selection_0 + # This is necessary to capture the Port_Running_0 and Port_Running_1 events + IpuWrite32(0, 2, 0x3FF00, 0x121) + + # Trace_Control0: Define trace start and stop triggers. Set start event TRUE. + IpuWrite32(0, 2, 0x340D0, 0x10000) + + # Start trace copy out. 
+ IpuWriteBdShimTile( + bd_id=3, + buffer_length=traceSizeInBytes, + buffer_offset=tensorSize, + enable_packet=0, + out_of_order_id=0, + packet_id=0, + packet_type=0, + column=0, + column_num=1, + d0_stride=0, + d0_wrap=0, + d1_stride=0, + d1_wrap=0, + d2_stride=0, + ddr_id=2, + iteration_current=0, + iteration_stride=0, + iteration_wrap=0, + lock_acq_enable=0, + lock_acq_id=0, + lock_acq_val=0, + lock_rel_id=0, + lock_rel_val=0, + next_bd=0, + use_next_bd=0, + valid_bd=1, + ) + IpuWrite32(0, 0, 0x1D20C, 0x3) + + ipu_dma_memcpy_nd( + metadata="in", + bd_id=0, + mem=inTensor, + sizes=[1, 1, 1, tensorSizeInInt32s], + ) + ipu_dma_memcpy_nd( + metadata="out", + bd_id=1, + mem=outTensor, + sizes=[1, 1, 1, tensorSizeInInt32s], + ) + ipu_sync(column=0, row=0, direction=0, channel=0) + + print(ctx.module) + +passThroughAIE2() diff --git a/programming_examples/basic/passthrough_kernel/test.cpp b/programming_examples/basic/passthrough_kernel/test.cpp new file mode 100644 index 0000000000..f9fe19281f --- /dev/null +++ b/programming_examples/basic/passthrough_kernel/test.cpp @@ -0,0 +1,130 @@ +//===- test.cpp -------------------------------------------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2023, Advanced Micro Devices, Inc. 
+// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include "xrt/xrt_bo.h" +#include "xrtUtils.h" + +#ifndef DATATYPES_USING_DEFINED +#define DATATYPES_USING_DEFINED +// ------------------------------------------------------ +// Configure this to match your buffer data type +// ------------------------------------------------------ +using DATATYPE = std::uint8_t; +#endif + +namespace po = boost::program_options; + +int main(int argc, const char *argv[]) { + + // Program arguments parsing + po::options_description desc("Allowed options"); + desc.add_options() + ("help,h", "produce help message") + ("xclbin,x", po::value()->required(), "the input xclbin path") + ("kernel,k", po::value()->required(), "the kernel name in the XCLBIN (for instance PP_PRE_FD)") + ("verbosity,v", po::value()->default_value(0), "the verbosity of the output") + ("instr,i", po::value()->required(), "path of file containing userspace instructions to be sent to the LX6"); + po::variables_map vm; + + try { + po::store(po::parse_command_line(argc, argv, desc), vm); + po::notify(vm); + + if (vm.count("help")) { + std::cout << desc << "\n"; + return 1; + } + } catch (const std::exception &ex) { + std::cerr << ex.what() << "\n\n"; + std::cerr << "Usage:\n" << desc << "\n"; + return 1; + } + + try { + check_arg_file_exists(vm, "xclbin"); + check_arg_file_exists(vm, "instr"); + } catch (const std::exception &ex) { + std::cerr << ex.what() << "\n\n"; + } + + // Load instruction sequence + std::vector instr_v = + load_instr_sequence(vm["instr"].as()); + + int verbosity = vm["verbosity"].as(); + if (verbosity >= 1) + std::cout << "Sequence instr count: " << instr_v.size() << "\n"; + + // Start the XRT context and load the kernel + xrt::device device; + xrt::kernel kernel; + + initXrtLoadKernel(device, kernel, verbosity, vm["xclbin"].as(), vm["kernel"].as()); + + // set up the buffer objects + auto bo_instr = xrt::bo(device, 
instr_v.size() * sizeof(int),XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + auto bo_inA = xrt::bo(device, PASSTHROUGH_SIZE * sizeof(DATATYPE), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); + auto bo_out = xrt::bo(device, PASSTHROUGH_SIZE * sizeof(DATATYPE), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + + if (verbosity >= 1) + std::cout << "Writing data into buffer objects.\n"; + + // Copy instruction stream to xrt buffer object + void *bufInstr = bo_instr.map(); + memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); + + // Initialize buffer bo_inA + DATATYPE *bufInA = bo_inA.map(); + for (int i = 0; i < PASSTHROUGH_SIZE; i++) + bufInA[i] = i; + + // Zero out buffer bo_out + DATATYPE *bufOut = bo_out.map(); + memset(bufOut, 0, PASSTHROUGH_SIZE * sizeof(DATATYPE)); + + // sync host to device memories + bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_out.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + // Execute the kernel and wait to finish + if (verbosity >= 1) + std::cout << "Running Kernel.\n"; + auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_out); + run.wait(); + + // Sync device to host memories + bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + + // Compare out to in + int numberOfDifferences = 0; + for(int i = 0; i < PASSTHROUGH_SIZE; i++) { + if(bufOut[i] != bufInA[i]) + numberOfDifferences++; + } + + // Print Pass/Fail result of our test + int res = 0; + if (numberOfDifferences == 0) { + printf("PASS!\n"); + res = 0; + } else { + printf("Fail!\n"); + res = -1; + } + + printf("Testing passThrough done!\n"); + return res; +}