diff --git a/include/aie/Dialect/AIEX/IR/AIEX.td b/include/aie/Dialect/AIEX/IR/AIEX.td
index 9f20d9222c..cb8fe6f5bb 100644
--- a/include/aie/Dialect/AIEX/IR/AIEX.td
+++ b/include/aie/Dialect/AIEX/IR/AIEX.td
@@ -509,6 +509,32 @@ def AIE_NpuDmaMemcpyNdOp: AIEX_Op<"npu.dma_memcpy_nd", [
   let extraClassDeclaration = [{
     static unsigned getOffsetSizeAndStrideStartOperandIndex();
     static std::array<unsigned, 3> getArrayAttrMaxRanks();
+
+    /* Returns the provided multi-dimensional data transfer strides in units
+       of address granularity. In the IR, we express strides in units of the
+       element data type, but the hardware requires them in units of address
+       granularity. Address granularity is currently 4 bytes for all hardware.
+
+       The returned stride[0] is the second-lowest dimension stride, i.e.
+       stride 1. The lowest stride is currently implicitly one, but this is
+       not a hardware requirement and could be changed in the future. */
+    llvm::SmallVector<int64_t, 3> getStridesInAddressGranularity();
+
+    /* Returns the multi-dimensional data transfer sizes in units of address
+       granularity. These sizes are expressed in units of the element data
+       type in the IR, but the hardware requires them to be in units of
+       address granularity. Address granularity is currently 4 bytes for all
+       hardware.
+
+       The returned size[0] is the lowest dimension size. In the IR, the
+       sizes are given in reverse order; for example, specifying sizes as
+       [1, 2, 3, 4] in the IR results in this function returning
+       [4, 3, 2, 1]. */
+    llvm::SmallVector<int64_t, 4> getSizesInAddressGranularity();
+
+    /* Returns the data transfer offset in bytes, i.e. the first N bytes of
+       the target buffer will be skipped. In the IR, offsets are expressed in
+       units of the memref element data type size. */
+    int64_t getOffsetInBytes();
   }];

   let extraClassDefinition = [{
diff --git a/lib/Dialect/AIEX/IR/AIEXDialect.cpp b/lib/Dialect/AIEX/IR/AIEXDialect.cpp
index e1102b4fe3..0f8ddc5f7c 100644
--- a/lib/Dialect/AIEX/IR/AIEXDialect.cpp
+++ b/lib/Dialect/AIEX/IR/AIEXDialect.cpp
@@ -64,10 +64,69 @@ LogicalResult AIEX::BroadcastPacketOp::verify() {
   return success();
 }
 
+llvm::SmallVector<int64_t, 3>
+AIEX::NpuDmaMemcpyNdOp::getStridesInAddressGranularity() {
+  const auto &targetModel = AIE::getTargetModel(*this);
+  MemRefType buffer = getMemref().getType();
+  auto elemWidth = buffer.getElementTypeBitWidth();
+  auto addressGranularity = targetModel.getAddressGenGranularity();
+  llvm::SmallVector<int64_t, 3> strides = llvm::map_to_vector(
+      llvm::reverse(getMixedStrides()),
+      [](OpFoldResult s) { return getConstantIntValue(s).value(); });
+  if (!strides.empty()) {
+    for (int i = 0; i < 3; i++) {
+      strides[i] = (strides[i] * elemWidth) / addressGranularity;
+    }
+  }
+  return strides;
+}
+
+llvm::SmallVector<int64_t, 4>
+AIEX::NpuDmaMemcpyNdOp::getSizesInAddressGranularity() {
+  const auto &targetModel = AIE::getTargetModel(*this);
+  MemRefType buffer = getMemref().getType();
+  auto elemWidth = buffer.getElementTypeBitWidth();
+  auto addressGranularity = targetModel.getAddressGenGranularity();
+  llvm::SmallVector<int64_t, 4> sizes = llvm::map_to_vector(
+      llvm::reverse(getMixedSizes()),
+      [](OpFoldResult s) { return getConstantIntValue(s).value(); });
+  if (!sizes.empty()) {
+    sizes[0] = (sizes[0] * elemWidth) / addressGranularity;
+  }
+  return sizes;
+}
+
+/* Returns the data transfer offset in bytes, i.e. the number of bytes
+   skipped at the start of the target buffer, linearized against the memref
+   shape. In the IR, offsets are expressed in units of the memref element
+   data type size. */
+int64_t AIEX::NpuDmaMemcpyNdOp::getOffsetInBytes() {
+  llvm::SmallVector<int64_t, 4> offsets = llvm::map_to_vector(
+      llvm::reverse(getMixedOffsets()),
+      [](OpFoldResult s) { return getConstantIntValue(s).value(); });
+  size_t stride = 1;
+  size_t offset = 0;
+  MemRefType my_memref = getMemref().getType();
+  auto shape = my_memref.getShape();
+  size_t R = shape.size();
+  size_t el_bit_width = my_memref.getElementTypeBitWidth();
+  assert(el_bit_width % 8 == 0 &&
+         "Expected Memref element bitwidth to be multiple of 8.");
+  size_t S = el_bit_width / 8;
+  for (size_t i = 0; i < R; i++) {
+    offset += offsets[i] * stride * S;
+    stride *= shape[R - i - 1];
+  }
+  return offset;
+}
+
 LogicalResult AIEX::NpuDmaMemcpyNdOp::verify() {
   MemRefType buffer = getMemref().getType();
   const auto &targetModel = AIE::getTargetModel(*this);
   auto addressGranularity = targetModel.getAddressGenGranularity();
+  auto elemWidth = buffer.getElementTypeBitWidth();
+
   if (buffer.getElementTypeBitWidth() > addressGranularity) {
     return emitOpError("Maximum element bit width allowed is ")
            << addressGranularity << "bits. ";
@@ -79,25 +138,29 @@ LogicalResult AIEX::NpuDmaMemcpyNdOp::verify() {
   if (!llvm::all_of(getMixedStrides(), [](OpFoldResult s) {
         return getConstantIntValue(s).has_value();
       }))
-    llvm::report_fatal_error("Only constant strides currently supported.");
+    return emitOpError("Only constant strides currently supported.");
   if (!llvm::all_of(getMixedSizes(), [](OpFoldResult s) {
         return getConstantIntValue(s).has_value();
       }))
-    llvm::report_fatal_error("Only constant sizes currently supported.");
+    return emitOpError("Only constant sizes currently supported.");
   if (!llvm::all_of(getMixedOffsets(), [](OpFoldResult s) {
         return getConstantIntValue(s).has_value();
       }))
-    llvm::report_fatal_error("Only constant offsets currently supported.");
+    return emitOpError("Only constant offsets currently supported.");
 
-  llvm::SmallVector<int64_t, 3> strides =
+  llvm::SmallVector<int64_t, 3> raw_strides =
       llvm::map_to_vector(llvm::reverse(getMixedStrides()), [](OpFoldResult s) {
         return getConstantIntValue(s).value();
       });
-  llvm::SmallVector<int64_t, 4> sizes =
+  llvm::SmallVector<int64_t, 4> raw_sizes =
       llvm::map_to_vector(llvm::reverse(getMixedSizes()), [](OpFoldResult s) {
         return getConstantIntValue(s).value();
       });
+  llvm::SmallVector<int64_t, 3> strides = getStridesInAddressGranularity();
+  llvm::SmallVector<int64_t, 4> sizes = getSizesInAddressGranularity();
+  int64_t offset = getOffsetInBytes();
+
   if (sizes[3] > 64)
     return emitOpError("Size 3 exceeds the [1:64] range.");
   if (strides[1] && sizes[1] > 0x3FF)
@@ -110,6 +173,36 @@ LogicalResult AIEX::NpuDmaMemcpyNdOp::verify() {
     return emitOpError("Stride 2 exceeds the [1:1M] range.");
   if (strides[0] > 0x100000)
     return emitOpError("Stride 1 exceeds the [1:1M] range.");
+
+  if (offset % 4 != 0) {
+    return emitOpError("Offset must be 4-byte-aligned.");
+  }
+
+  bool error = false;
+  std::stringstream msg;
+  for (int i = 0; i < 3; i++) {
+    if (raw_strides[i] * elemWidth % addressGranularity != 0) {
+      error = true;
+      msg << "Stride " << i << " is " << raw_strides[i] << " elements * "
+          << (elemWidth / 8) << " bytes = " << (raw_strides[i] * elemWidth / 8)
+          << " bytes, which is not divisible by " << (addressGranularity / 8)
+          << ". ";
+    }
+  }
+  if (error) {
+    return emitOpError(msg.str());
+  }
+
+  if (raw_sizes[0] * elemWidth % addressGranularity != 0) {
+    std::stringstream msg;
+    msg << "Transfer sizes must be multiples of " << (addressGranularity / 8)
+        << " bytes. " << raw_sizes[0] << " elements at " << (elemWidth / 8)
+        << " bytes each equal " << (raw_sizes[0] * elemWidth / 8)
+        << " bytes, which is not divisible by " << (addressGranularity / 8)
+        << ". ";
+    return emitOpError(msg.str());
+  }
+
   return success();
 }
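The three helpers above all reduce to the same unit conversions. As a cross-check, here is an illustrative Python mirror of that arithmetic (hypothetical helper name; the authoritative logic is the C++ above, assuming the 4-byte address granularity that the AIEX.td comments describe for current hardware):

    # Hypothetical mirror of the unit conversions; lowest dimension first.
    def to_hw_units(strides_el, sizes_el, offsets_el, shape, elem_bytes, gran=4):
        # Strides: scaled from elements to address-granularity words.
        strides = [(s * elem_bytes) // gran for s in strides_el]
        # Sizes: only the lowest (contiguous) dimension is rescaled; the
        # higher dimensions count iterations, not elements.
        sizes = list(sizes_el)
        sizes[0] = (sizes[0] * elem_bytes) // gran
        # Offset: linearized against the memref shape, in bytes.
        offset, stride = 0, 1
        for i, off in enumerate(offsets_el):
            offset += off * stride * elem_bytes
            stride *= shape[len(shape) - i - 1]
        return strides, sizes, offset

    # An i16 memref: a stride of 2 elements is 4 bytes, i.e. 1 word.
    assert to_hw_units([2, 4, 0], [2, 3, 2, 1], [0], [24], elem_bytes=2) == \
        ([1, 2, 0], [1, 3, 2, 1], 0)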
"; + return emitOpError(msg.str()); + } + return success(); } diff --git a/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp index 94d514c23a..58eb893af7 100644 --- a/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp +++ b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp @@ -219,31 +219,9 @@ struct DmaToNpuPattern : OpConversionPattern { auto issue_token = BoolAttr::get(ctx, false); auto repeat_count = zero; - llvm::SmallVector strides = llvm::map_to_vector( - llvm::reverse(op.getMixedStrides()), - [](OpFoldResult s) { return getConstantIntValue(s).value(); }); - llvm::SmallVector sizes = llvm::map_to_vector( - llvm::reverse(op.getMixedSizes()), - [](OpFoldResult s) { return getConstantIntValue(s).value(); }); - llvm::SmallVector offsets = llvm::map_to_vector( - llvm::reverse(op.getMixedOffsets()), - [](OpFoldResult s) { return getConstantIntValue(s).value(); }); - - MemRefType buffer = op.getMemref().getType(); - const auto &targetModel = AIE::getTargetModel(op); - auto elemWidth = buffer.getElementTypeBitWidth(); - auto addressGranularity = targetModel.getAddressGenGranularity(); - if (elemWidth < addressGranularity) { - if (!strides.empty()) { - for (int i = 0; i < 3; i++) { - strides[i] = (strides[i] * elemWidth) / addressGranularity; - } - } - if (!sizes.empty()) - sizes[0] = (sizes[0] * elemWidth) / addressGranularity; - if (!offsets.empty()) - offsets[0] = (offsets[0] * elemWidth) / addressGranularity; - } + llvm::SmallVector strides = op.getStridesInAddressGranularity(); + llvm::SmallVector sizes = op.getSizesInAddressGranularity(); + int64_t offset = op.getOffsetInBytes(); // column column = IntegerAttr::get(i32ty, col); @@ -271,19 +249,6 @@ struct DmaToNpuPattern : OpConversionPattern { buffer_length = IntegerAttr::get(i32ty, repeat_length); // buffer_offset - size_t stride = 1; - size_t offset = 0; - MemRefType my_memref = op.getMemref().getType(); - auto shape = my_memref.getShape(); - size_t R = shape.size(); - size_t el_bit_width = my_memref.getElementTypeBitWidth(); - assert(el_bit_width % 8 == 0 && - "Expected Memref element bitwidth to be multiple of 8."); - size_t S = el_bit_width / 8; - for (size_t i = 0; i < R; i++) { - offset += offsets[i] * stride * S; - stride *= shape[R - i - 1]; - } buffer_offset = IntegerAttr::get(i32ty, offset); // enable_packet diff --git a/programming_examples/basic/matrix_multiplication/single_core/aie2.py b/programming_examples/basic/matrix_multiplication/single_core/aie2.py index 6b6a48e400..cad1d9f52d 100644 --- a/programming_examples/basic/matrix_multiplication/single_core/aie2.py +++ b/programming_examples/basic/matrix_multiplication/single_core/aie2.py @@ -231,7 +231,7 @@ def sequence(A, B, C): for tile_row in range(num_tile_rows): A_row_offset = ( ((tile_row_block * rows_per_block) + tile_row) * m * K - ) * 2 + ) npu_dma_memcpy_nd( metadata="inA", bd_id=2 * tile_row + 1, diff --git a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py index 4adb1cd7e2..2ecf80cbcb 100644 --- a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py +++ b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py @@ -317,7 +317,7 @@ def sequence(A, B, C): C_row_offset = tile_row_block * rows_per_block * m * n_rows * N for i in range(n_cols): C_col_offset = i * n - C_offset = (C_col_offset + C_row_offset) * 2 + C_offset = C_col_offset + C_row_offset npu_dma_memcpy_nd( metadata=outC_fifo_names[i], bd_id=0, @@ -334,8 
diff --git a/test/dialect/AIEX/bad_npu_nd.mlir b/test/dialect/AIEX/bad_npu_nd.mlir
index c89587b3f1..e49530e325 100644
--- a/test/dialect/AIEX/bad_npu_nd.mlir
+++ b/test/dialect/AIEX/bad_npu_nd.mlir
@@ -66,3 +66,103 @@ module {
 }
 
 // -----
+
+// Offsets need to be 4-byte aligned.
+
+module {
+  aie.device(npu1_4col) {
+    func.func @bad_npu_nd_offset(%a : memref<8xi8>) {
+      %c0 = arith.constant 0 : i64
+      %c1 = arith.constant 1 : i64
+      %c2 = arith.constant 2 : i64
+      %c8 = arith.constant 8 : i64
+      // expected-error@+1 {{Offset must be 4-byte-aligned}}
+      aiex.npu.dma_memcpy_nd (0, 0, %a[%c0,%c0,%c0,%c1][%c1,%c1,%c1,%c8][%c0,%c0,%c1]) { metadata = @fifo, id = 0 : i64 } : memref<8xi8>
+      return
+    }
+    aie.shim_dma_allocation @fifo (MM2S, 0, 0)
+  }
+}
+
+// -----
+
+// Strides and sizes for element types narrower than i32 must not trip the
+// hardware limit checks once converted to 4-byte granularity. The following
+// tests check this.
+
+module {
+  aie.device(npu1_4col) {
+    func.func @bad_npu_nd(%a : memref<8xi8>) {
+      %c0 = arith.constant 0 : i64
+      %c1 = arith.constant 1 : i64
+      %c2 = arith.constant 2 : i64
+      %c4 = arith.constant 4 : i64
+      %c8 = arith.constant 8 : i64
+      %c2048 = arith.constant 2048 : i64
+      // Although 2048 exceeds the [0:1023] limit for size 0, the elements are
+      // i8s, so this is a size of 512 at address granularity (4 bytes) and
+      // hence passes verification.
+      aiex.npu.dma_memcpy_nd (0, 0, %a[%c0,%c0,%c0,%c0][%c1,%c1,%c2,%c2048][%c0,%c0,%c4]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi8>
+      return
+    }
+    aie.shim_dma_allocation @objectfifo (MM2S, 0, 0)
+  }
+}
+
+// -----
+
+module {
+  aie.device(npu1_4col) {
+    func.func @bad_npu_nd(%a : memref<8xi16>) {
+      %c0 = arith.constant 0 : i64
+      %c1 = arith.constant 1 : i64
+      %c2 = arith.constant 2 : i64
+      %c4 = arith.constant 4 : i64
+      %c8 = arith.constant 8 : i64
+      %c2048 = arith.constant 2048 : i64
+      // expected-error@+1 {{Size 0 exceeds the [0:1023] range}}
+      aiex.npu.dma_memcpy_nd (0, 0, %a[%c0,%c0,%c0,%c0][%c1,%c1,%c2,%c2048][%c0,%c0,%c4]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi16>
+      return
+    }
+    aie.shim_dma_allocation @objectfifo (MM2S, 0, 0)
+  }
+}
+
+// -----
+
+// Strides and sizes are expressed at 4-byte granularity in hardware, but we
+// express them at memref element type granularity. The following tests make
+// sure the proper errors are generated when the two cannot be reconciled.
+
+module {
+  aie.device(npu1_4col) {
+    func.func @bad_npu_nd(%a : memref<8xi8>) {
+      %c0 = arith.constant 0 : i64
+      %c1 = arith.constant 1 : i64
+      %c2 = arith.constant 2 : i64 // Stride of 2 i8s = 2 bytes < 4-byte granularity; should not be possible
+      %c8 = arith.constant 8 : i64
+      %c1920 = arith.constant 1920 : i64
+      %c1080 = arith.constant 1080 : i64
+      // expected-error@+1 {{Stride 0 is 2 elements * 1 bytes = 2 bytes, which is not divisible by 4}}
+      aiex.npu.dma_memcpy_nd (0, 0, %a[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c8][%c0,%c0,%c2]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi8>
+      return
+    }
+    aie.shim_dma_allocation @objectfifo (MM2S, 0, 0)
+  }
+}
+
+// -----
+
+module {
+  aie.device(npu1_4col) {
+    func.func @bad_npu_nd(%a : memref<8xi8>) {
+      %c0 = arith.constant 0 : i64
+      %c1 = arith.constant 1 : i64
+      %c2 = arith.constant 2 : i64
+      %c4 = arith.constant 4 : i64
+      %c8 = arith.constant 8 : i64
+      %c1920 = arith.constant 1920 : i64
+      %c1080 = arith.constant 1080 : i64
+      // expected-error@+1 {{2 elements at 1 bytes each equal 2 bytes, which is not divisible by 4}}
+      aiex.npu.dma_memcpy_nd (0, 0, %a[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c2][%c0,%c0,%c4]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi8>
+      return
+    }
+    aie.shim_dma_allocation @objectfifo (MM2S, 0, 0)
+  }
+}
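The expected-error strings in these cases can be replayed against a small model of the new checks. A hypothetical Python mirror, checking offset alignment first, then stride divisibility, then size divisibility, in the same order as the verifier above (4-byte granularity assumed):

    # Hypothetical mirror of the verifier's ordering of the new checks.
    def first_error(raw_strides, raw_sizes, offset_bytes, elem_bytes, gran=4):
        if offset_bytes % gran != 0:
            return "Offset must be 4-byte-aligned."
        for i, s in enumerate(raw_strides):
            if (s * elem_bytes) % gran != 0:
                return f"Stride {i} is {s * elem_bytes} bytes, not divisible by {gran}."
        if (raw_sizes[0] * elem_bytes) % gran != 0:
            return f"Size 0 is {raw_sizes[0] * elem_bytes} bytes, not divisible by {gran}."
        return None

    print(first_error([1, 0, 0], [8, 1, 1, 1], 1, elem_bytes=1))  # offset case
    print(first_error([2, 0, 0], [8, 1, 1, 1], 0, elem_bytes=1))  # stride case
    print(first_error([4, 0, 0], [2, 1, 1, 1], 0, elem_bytes=1))  # size case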
diff --git a/test/npu-xrt/nd_memcpy_transforms/aie2.py b/test/npu-xrt/nd_memcpy_transforms/aie2.py
new file mode 100644
index 0000000000..3df95e9493
--- /dev/null
+++ b/test/npu-xrt/nd_memcpy_transforms/aie2.py
@@ -0,0 +1,118 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2024 AMD Inc.
+
+# REQUIRES: ryzen_ai
+#
+# RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
+# RUN: %python %S/aie2.py > %S/aie2.mlir
+# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --xclbin-name=final.xclbin --npu-insts-name=insts.txt %S/aie2.mlir
+# RUN: clang %S/test.cpp -o test -std=c++11 -Wall %xrt_flags -lrt -lstdc++
+# RUN: %run_on_npu ./test | FileCheck %s
+# CHECK: PASS!
+
+from aie.extras.context import mlir_mod_ctx
+
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+from aie.dialects.scf import *
+
+dtype = T.i16
+a_len = 8
+b_len = 12
+c_offset = 2
+c_len = a_len + b_len
+
+
+def memref_sz(m: MemRefType):
+    sz = 1
+    for s in m.shape:
+        sz *= s
+    return sz
+
+
+def design():
+
+    with mlir_mod_ctx() as ctx:
+
+        @device(AIEDevice.npu1_4col)
+        def device_body():
+            memref_a = T.memref(a_len, dtype())
+            memref_b = T.memref(b_len, dtype())
+            memref_c = T.memref(c_len, dtype())
+
+            concat_func = external_func(
+                "concat",
+                inputs=[memref_a, memref_b, memref_c, T.i32(), T.i32(), T.i32()],
+            )
+
+            # Tile declarations as tile[row][col]
+            tiles = [[tile(col, row) for col in range(0, 4)] for row in range(0, 6)]
+            # Shim tiles: tiles[0][0..3]
+            # Mem tiles: tiles[1][0..3]
+            # Cores: tiles[2..5][0..3]
+
+            fifo_a = object_fifo("fifo_a", tiles[0][0], tiles[2][0], 2, memref_a)
+            fifo_b = object_fifo("fifo_b", tiles[0][0], tiles[2][0], 2, memref_b)
+            fifo_c = object_fifo("fifo_c", tiles[2][0], tiles[0][0], 2, memref_c)
+
+            # Core
+            @core(tiles[2][0], "kernel.o")
+            def core_body():
+                for _ in for_(0, 0xFFFFFFFF):
+                    elem_c = fifo_c.acquire(ObjectFifoPort.Produce, 1)
+                    elem_a = fifo_a.acquire(ObjectFifoPort.Consume, 1)
+                    elem_b = fifo_b.acquire(ObjectFifoPort.Consume, 1)
+                    call(
+                        concat_func,
+                        [
+                            elem_a,
+                            elem_b,
+                            elem_c,
+                            memref_sz(memref_a),
+                            memref_sz(memref_b),
+                            memref_sz(memref_c),
+                        ],
+                    )
+                    fifo_a.release(ObjectFifoPort.Consume, 1)
+                    fifo_b.release(ObjectFifoPort.Consume, 1)
+                    fifo_c.release(ObjectFifoPort.Produce, 1)
+                    yield_([])
+
+            # To/from AIE-array data movement
+            @FuncOp.from_py_func(memref_a, memref_b, memref_c)
+            def sequence(A, B, C):
+                npu_dma_memcpy_nd(
+                    metadata=fifo_a.sym_name.value,
+                    bd_id=1,
+                    mem=A,
+                    offsets=[0, 0, 0, 0],
+                    sizes=[1, a_len // 4, 2, 2],
+                    strides=[0, 2, a_len // 2],
+                )
+                npu_dma_memcpy_nd(
+                    metadata=fifo_b.sym_name.value,
+                    bd_id=1,
+                    mem=B,
+                    offsets=[0, 0, 0, 0],
+                    sizes=[1, 2, b_len // 4, 2],
+                    strides=[0, 2, 4],
+                )
+                npu_dma_memcpy_nd(
+                    metadata=fifo_c.sym_name.value,
+                    bd_id=0,
+                    mem=C,
+                    offsets=[0, 0, 0, c_offset],
+                    sizes=[1, 1, 1, c_len],
+                    strides=[0, 0, 0],
+                )
+                npu_sync(column=0, row=0, direction=0, channel=0)
+
+        print(ctx.module)
+
+
+design()
diff --git a/test/npu-xrt/nd_memcpy_transforms/kernel.cc b/test/npu-xrt/nd_memcpy_transforms/kernel.cc
new file mode 100644
index 0000000000..ef3ac9691a
--- /dev/null
+++ b/test/npu-xrt/nd_memcpy_transforms/kernel.cc
@@ -0,0 +1,20 @@
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// (c) Copyright 2024 AMD Inc.
+
+#include <stdint.h>
+
+typedef int16_t my_t;
+
+extern "C" {
+void concat(my_t *a, my_t *b, my_t *c, int a_sz, int b_sz, int c_sz) {
+  // Concatenates a and b and writes the result to c.
+  int i = 0;
+  for (; i < c_sz && i < a_sz; i++) {
+    c[i] = a[i];
+  }
+  for (; i < c_sz && i - a_sz < b_sz; i++) {
+    c[i] = b[i - a_sz];
+  }
+}
+}
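Before reading test.cpp below, it helps to reconstruct what the host should observe: the sizes and strides in aie2.py interleave halves of A and B on the way into the array, the kernel concatenates the two incoming streams, and the C transfer writes the result starting two elements into the output buffer. A small Python model of the 4-D address generation (illustrative only; it assumes the lowest dimension has an implicit stride of 1, as the AIEX.td comment above notes):

    # Illustrative model: sizes/strides ordered highest..lowest dimension,
    # with the lowest dimension using an implicit stride of 1.
    def stream(buf, sizes, strides):
        s3, s2, s1, s0 = sizes
        t3, t2, t1 = strides
        return [buf[i3 * t3 + i2 * t2 + i1 * t1 + i0]
                for i3 in range(s3) for i2 in range(s2)
                for i1 in range(s1) for i0 in range(s0)]

    a = [2 * i for i in range(8)]       # even values, as in test.cpp
    b = [2 * i + 1 for i in range(12)]  # odd values
    a_in = stream(a, [1, 2, 2, 2], [0, 2, 4])  # [0, 2, 8, 10, 4, 6, 12, 14]
    b_in = stream(b, [1, 2, 3, 2], [0, 2, 4])  # [1, 3, 9, 11, 17, 19, 5, ...]
    expected = [0, 0] + a_in + b_in  # matches ref[] in test.cpp below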
diff --git a/test/npu-xrt/nd_memcpy_transforms/test.cpp b/test/npu-xrt/nd_memcpy_transforms/test.cpp
new file mode 100644
index 0000000000..646496e08a
--- /dev/null
+++ b/test/npu-xrt/nd_memcpy_transforms/test.cpp
@@ -0,0 +1,143 @@
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// (c) Copyright 2024 AMD Inc.
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+// --------------------------------------------------------------------------
+// AIE Specifics
+// --------------------------------------------------------------------------
+
+std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
+  std::ifstream instr_file(instr_path);
+  std::string line;
+  std::vector<uint32_t> instr_v;
+  while (std::getline(instr_file, line)) {
+    std::istringstream iss(line);
+    uint32_t a;
+    if (!(iss >> std::hex >> a)) {
+      throw std::runtime_error("Unable to parse instruction file\n");
+    }
+    instr_v.push_back(a);
+  }
+  return instr_v;
+}
+
+#ifndef XCLBIN
+#define XCLBIN "final.xclbin"
+#endif
+
+#ifndef INSTS_TXT
+#define INSTS_TXT "insts.txt"
+#endif
+
+#ifndef KERNEL_NAME
+#define KERNEL_NAME "MLIR_AIE"
+#endif
+
+#define DTYPE int16_t
+#define A_DATATYPE DTYPE
+#define B_DATATYPE DTYPE
+#define C_DATATYPE DTYPE
+
+#define A_LEN 8
+#define B_LEN 12
+#define C_OFFSET 2
+#define C_LEN (A_LEN + B_LEN + C_OFFSET)
+
+#define A_SIZE (A_LEN * sizeof(A_DATATYPE)) // in bytes
+#define B_SIZE (B_LEN * sizeof(B_DATATYPE)) // in bytes
+#define C_SIZE (C_LEN * sizeof(C_DATATYPE)) // in bytes
+
+int main(int argc, const char *argv[]) {
+
+  std::vector<uint32_t> instr_v = load_instr_sequence(INSTS_TXT);
+  assert(instr_v.size() > 0);
+
+  // Get a device handle
+  unsigned int device_index = 0;
+  xrt::device device = xrt::device(device_index);
+
+  // Load the xclbin
+  xrt::xclbin xclbin = xrt::xclbin(XCLBIN);
+
+  // Get the kernel from the xclbin
+  std::vector<xrt::xclbin::kernel> xkernels = xclbin.get_kernels();
+  xrt::xclbin::kernel xkernel = *std::find_if(
+      xkernels.begin(), xkernels.end(), [](xrt::xclbin::kernel &k) {
+        return k.get_name().rfind(KERNEL_NAME, 0) == 0;
+      });
+  std::string kernel_name = xkernel.get_name();
+  assert(strcmp(kernel_name.c_str(), KERNEL_NAME) == 0);
+
+  device.register_xclbin(xclbin);
+
+  // get a hardware context
+  xrt::hw_context context(device, xclbin.get_uuid());
+
+  // get a kernel handle
+  auto kernel = xrt::kernel(context, kernel_name);
+
+  auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
+                          XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
+  auto bo_a =
+      xrt::bo(device, A_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+  auto bo_b =
+      xrt::bo(device, B_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
+  auto bo_c =
+      xrt::bo(device, C_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5));
+
+  A_DATATYPE *buf_a = bo_a.map<A_DATATYPE *>();
+  for (int i = 0; i < A_SIZE / sizeof(buf_a[0]); i++) {
+    buf_a[i] = 2 * i; // even
+  }
+  B_DATATYPE *buf_b = bo_b.map<B_DATATYPE *>();
+  for (int i = 0; i < B_SIZE / sizeof(buf_b[0]); i++) {
+    buf_b[i] = 2 * i + 1; // odd
+  }
+  C_DATATYPE *buf_c = bo_c.map<C_DATATYPE *>();
+  memset(buf_c, 0, C_SIZE);
+
+  // Instruction buffer for DMA configuration
+  void *bufInstr = bo_instr.map<void *>();
+  memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
+
+  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_a.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_b.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_c.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  unsigned int opcode = 3;
+  auto run = kernel(opcode, bo_instr, instr_v.size(), bo_a, bo_b, bo_c);
+  ert_cmd_state r = run.wait();
+  if (r != ERT_CMD_STATE_COMPLETED) {
+    std::cout << "Kernel did not complete. Returned status: " << r << "\n";
+    return 1;
+  }
+
+  bo_c.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+
+  for (int i = 0; i < C_SIZE / sizeof(buf_c[0]); i++) {
+    std::cout << std::setw(4) << (long)buf_c[i] << " ";
+  }
+  std::cout << std::endl;
+
+  C_DATATYPE ref[] = {0, 0, 0, 2, 8, 10, 4, 6, 12, 14, 1,
+                      3, 9, 11, 17, 19, 5, 7, 13, 15, 21, 23};
+  if (memcmp(ref, buf_c, sizeof(ref)) == 0) {
+    std::cout << "PASS!" << std::endl;
+  } else {
+    std::cout << "FAIL." << std::endl;
+  }
+
+  return 0;
+}