Skip to content

Commit

Permalink
POC multiple kernels in one xclbin (#1508)
Browse files Browse the repository at this point in the history
  • Loading branch information
fifield authored May 28, 2024
1 parent 6338bc5 commit b693d4e
Show file tree
Hide file tree
Showing 6 changed files with 363 additions and 1 deletion.
6 changes: 6 additions & 0 deletions python/compiler/aiecc/cl_arguments.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,12 @@ def parse_args(args=None):
const=True,
help="Generate xclbin",
)
parser.add_argument(
"--xclbin-input",
dest="xclbin_input",
default=None,
help="Generate kernel into existing xclbin file",
)
parser.add_argument(
"--link_against_hsa",
dest="link_against_hsa",
Expand Down
20 changes: 19 additions & 1 deletion python/compiler/aiecc/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -591,7 +591,25 @@ async def process_xclbin_gen(self):

# fmt: off
await self.do_call(task, ["bootgen", "-arch", "versal", "-image", self.prepend_tmp("design.bif"), "-o", self.prepend_tmp("design.pdi"), "-w"])
await self.do_call(task, ["xclbinutil", "--add-replace-section", "MEM_TOPOLOGY:JSON:" + self.prepend_tmp("mem_topology.json"), "--add-kernel", self.prepend_tmp("kernels.json"), "--add-replace-section", "AIE_PARTITION:JSON:" + self.prepend_tmp("aie_partition.json"), "--force", "--output", opts.xclbin_name])
if opts.xclbin_input:
await self.do_call(task, ["xclbinutil",
"--dump-section", "AIE_PARTITION:JSON:" + self.prepend_tmp("aie_input_partition.json"),
"--force", "--input", opts.xclbin_input])
with open(self.prepend_tmp("aie_input_partition.json")) as f:
input_partition = json.load(f)
with open(self.prepend_tmp("aie_partition.json")) as f:
new_partition = json.load(f)
input_partition["aie_partition"]["PDIs"].append(new_partition["aie_partition"]["PDIs"][0])
with open(self.prepend_tmp("aie_partition.json"), "w") as f:
json.dump(input_partition, f, indent=2)
flag = ['--input', opts.xclbin_input]
else:
flag = ["--add-replace-section", "MEM_TOPOLOGY:JSON:" + self.prepend_tmp("mem_topology.json")]

await self.do_call(task, ["xclbinutil"] + flag +
["--add-kernel", self.prepend_tmp("kernels.json"),
"--add-replace-section", "AIE_PARTITION:JSON:" + self.prepend_tmp("aie_partition.json"),
"--force", "--output", opts.xclbin_name])
# fmt: on

async def process_host_cgen(self, aie_target, file_with_addresses):
Expand Down
53 changes: 53 additions & 0 deletions test/npu-xrt/add_one_two/aie1.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
//===- aie1.mlir -----------------------------------------------*- MLIR -*-===//
//
// Copyright (C) 2023, Advanced Micro Devices, Inc.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// "Add one" design: the 64 i32 values of %in are moved through tile (0,1) to
// the core on tile (0,2), which adds 1 to every element, and the results are
// moved back along the reverse path into %out.
module {
aie.device(npu1_1col) {
// Tiles used by the design; the core below runs on %t02.
// NOTE(review): on npu1 row 0 is presumably the shim tile and row 1 the mem
// tile -- confirm against the device model.
%t00 = aie.tile(0, 0)
%t01 = aie.tile(0, 1)
%t02 = aie.tile(0, 2)

// Input path: %t00 -> %t01 in 16-element objects, then %t01 -> %t02 in
// 8-element objects; the link forwards data from the first fifo to the second.
aie.objectfifo @objFifo_in0(%t00, {%t01}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
aie.objectfifo @objFifo_in1(%t01, {%t02}, 2 : i32) : !aie.objectfifo<memref<8xi32>>
aie.objectfifo.link [@objFifo_in0] -> [@objFifo_in1] ()

// Output path: the mirror image of the input path, %t02 -> %t01 -> %t00.
aie.objectfifo @objFifo_out1(%t02, {%t01}, 2 : i32) : !aie.objectfifo<memref<8xi32>>
aie.objectfifo @objFifo_out0(%t01, {%t00}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
aie.objectfifo.link [@objFifo_out1] -> [@objFifo_out0] ()

aie.core(%t02) {
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
// The value added to every element by this kernel.
%c1_32 = arith.constant 1 : i32

// 8 outer iterations x 8 elements each = 64 elements processed in total.
scf.for %steps = %c0 to %c8 step %c1 {
// Acquire one 8-element input object and one 8-element output object.
%subview0 = aie.objectfifo.acquire @objFifo_in1(Consume, 1) : !aie.objectfifosubview<memref<8xi32>>
%elem0 = aie.objectfifo.subview.access %subview0[0] : !aie.objectfifosubview<memref<8xi32>> -> memref<8xi32>
%subview1 = aie.objectfifo.acquire @objFifo_out1(Produce, 1) : !aie.objectfifosubview<memref<8xi32>>
%elem1 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview<memref<8xi32>> -> memref<8xi32>
// out[i] = in[i] + 1 for each of the 8 elements.
scf.for %arg3 = %c0 to %c8 step %c1 {
%0 = memref.load %elem0[%arg3] : memref<8xi32>
%1 = arith.addi %0, %c1_32 : i32
memref.store %1, %elem1[%arg3] : memref<8xi32>
}
// Release both objects so the next iteration can acquire fresh ones.
aie.objectfifo.release @objFifo_in1(Consume, 1)
aie.objectfifo.release @objFifo_out1(Produce, 1)
}
aie.end
}
// Runtime sequence taking three buffers; %buf is not referenced in the body.
func.func @sequence(%in : memref<64xi32>, %buf : memref<32xi32>, %out : memref<64xi32>) {
%c0 = arith.constant 0 : i64
%c1 = arith.constant 1 : i64
%c64 = arith.constant 64 : i64
// Both transfers use offsets of 0 and sizes [1,1,1,64], i.e. all 64 elements.
// The output transfer (@objFifo_out0) is issued before the input transfer
// (@objFifo_in0).
aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_out0, id = 1 : i64 } : memref<64xi32>
aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<64xi32>
// NOTE(review): presumably blocks until the output transfer on channel 0 of
// tile (0,0) has completed -- confirm against the aiex.npu.sync documentation.
aiex.npu.sync { column = 0 : i32, row = 0 : i32, direction = 0 : i32, channel = 0 : i32, column_num = 1 : i32, row_num = 1 : i32 }
return
}
}
}
53 changes: 53 additions & 0 deletions test/npu-xrt/add_one_two/aie2.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
//===- aie2.mlir -----------------------------------------------*- MLIR -*-===//
//
// Copyright (C) 2023, Advanced Micro Devices, Inc.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// "Add two" design: the 64 i32 values of %in are moved through tile (0,1) to
// the core on tile (0,2), which adds 2 to every element, and the results are
// moved back along the reverse path into %out.
module {
aie.device(npu1_1col) {
// Tiles used by the design; the core below runs on %t02.
// NOTE(review): on npu1 row 0 is presumably the shim tile and row 1 the mem
// tile -- confirm against the device model.
%t00 = aie.tile(0, 0)
%t01 = aie.tile(0, 1)
%t02 = aie.tile(0, 2)

// Input path: %t00 -> %t01 in 16-element objects, then %t01 -> %t02 in
// 8-element objects; the link forwards data from the first fifo to the second.
aie.objectfifo @objFifo_in0(%t00, {%t01}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
aie.objectfifo @objFifo_in1(%t01, {%t02}, 2 : i32) : !aie.objectfifo<memref<8xi32>>
aie.objectfifo.link [@objFifo_in0] -> [@objFifo_in1] ()

// Output path: the mirror image of the input path, %t02 -> %t01 -> %t00.
aie.objectfifo @objFifo_out1(%t02, {%t01}, 2 : i32) : !aie.objectfifo<memref<8xi32>>
aie.objectfifo @objFifo_out0(%t01, {%t00}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
aie.objectfifo.link [@objFifo_out1] -> [@objFifo_out0] ()

aie.core(%t02) {
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
// The value added to every element by this kernel.
%c2_32 = arith.constant 2 : i32

// 8 outer iterations x 8 elements each = 64 elements processed in total.
scf.for %steps = %c0 to %c8 step %c1 {
// Acquire one 8-element input object and one 8-element output object.
%subview0 = aie.objectfifo.acquire @objFifo_in1(Consume, 1) : !aie.objectfifosubview<memref<8xi32>>
%elem0 = aie.objectfifo.subview.access %subview0[0] : !aie.objectfifosubview<memref<8xi32>> -> memref<8xi32>
%subview1 = aie.objectfifo.acquire @objFifo_out1(Produce, 1) : !aie.objectfifosubview<memref<8xi32>>
%elem1 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview<memref<8xi32>> -> memref<8xi32>
// out[i] = in[i] + 2 for each of the 8 elements.
scf.for %arg3 = %c0 to %c8 step %c1 {
%0 = memref.load %elem0[%arg3] : memref<8xi32>
%1 = arith.addi %0, %c2_32 : i32
memref.store %1, %elem1[%arg3] : memref<8xi32>
}
// Release both objects so the next iteration can acquire fresh ones.
aie.objectfifo.release @objFifo_in1(Consume, 1)
aie.objectfifo.release @objFifo_out1(Produce, 1)
}
aie.end
}
// Runtime sequence taking three buffers; %buf is not referenced in the body.
func.func @sequence(%in : memref<64xi32>, %buf : memref<32xi32>, %out : memref<64xi32>) {
%c0 = arith.constant 0 : i64
%c1 = arith.constant 1 : i64
%c64 = arith.constant 64 : i64
// Both transfers use offsets of 0 and sizes [1,1,1,64], i.e. all 64 elements.
// The output transfer (@objFifo_out0) is issued before the input transfer
// (@objFifo_in0).
aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_out0, id = 1 : i64 } : memref<64xi32>
aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<64xi32>
// NOTE(review): presumably blocks until the output transfer on channel 0 of
// tile (0,0) has completed -- confirm against the aiex.npu.sync documentation.
aiex.npu.sync { column = 0 : i32, row = 0 : i32, direction = 0 : i32, channel = 0 : i32, column_num = 1 : i32, row_num = 1 : i32 }
return
}
}
}
11 changes: 11 additions & 0 deletions test/npu-xrt/add_one_two/run.lit
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
// (c) Copyright 2023 Advanced Micro Devices, Inc.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// REQUIRES: ryzen_ai
//
// Step 1: compile aie1.mlir into add_one.xclbin, exposing it as kernel ADDONE
// (id 0x901).
// RUN: %python aiecc.py --xclbin-kernel-name=ADDONE --xclbin-kernel-id=0x901 --xclbin-instance-name=ADDONEINST --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=add_one.xclbin --npu-insts-name=insts.txt %S/aie1.mlir
// Step 2: compile aie2.mlir as kernel ADDTWO (id 0x902) and add it to the
// existing add_one.xclbin via --xclbin-input, writing the combined result to
// add_two.xclbin. Both steps write insts.txt, so the file left on disk is the
// one produced by this second step.
// RUN: %python aiecc.py --xclbin-kernel-name=ADDTWO --xclbin-kernel-id=0x902 --xclbin-instance-name=ADDTWOINST --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-input=add_one.xclbin --xclbin-name=add_two.xclbin --npu-insts-name=insts.txt %S/aie2.mlir
// Step 3: build the host program.
// RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
// Step 4: run the host program against the combined xclbin and verify that it
// reports success.
// RUN: %run_on_npu ./test.exe -x add_two.xclbin -i insts.txt | FileCheck %s
// CHECK: PASS!

221 changes: 221 additions & 0 deletions test/npu-xrt/add_one_two/test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,221 @@
//===- test.cpp -------------------------------------------------*- C++ -*-===//
//
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// Copyright (C) 2023, Advanced Micro Devices, Inc.
//
//===----------------------------------------------------------------------===//

#include <algorithm>
#include <boost/program_options.hpp>
#include <cstdint>
#include <cstring>
#include <fstream>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

#include "xrt/xrt_bo.h"
#include "xrt/xrt_device.h"
#include "xrt/xrt_kernel.h"

constexpr int IN_SIZE = 64;
constexpr int OUT_SIZE = 64;

namespace po = boost::program_options;

// Verifies that the command-line option `name` was supplied and that the path
// it holds can be opened for reading.
//
// Throws std::runtime_error if the option is absent from `vm_in` or if the
// file cannot be opened.
void check_arg_file_exists(po::variables_map &vm_in, std::string name) {
  // Guard: the option never appeared on the command line.
  if (vm_in.count(name) == 0)
    throw std::runtime_error("Error: no " + name + " file was provided\n");

  // Probe the file by opening it; the stream is closed again on scope exit.
  const std::string &path = vm_in[name].as<std::string>();
  std::ifstream probe(path);
  if (probe.fail())
    throw std::runtime_error("The " + name + " file " + path +
                             " does not exist.\n");
}

// Reads an instruction sequence from the text file at `instr_path`.
//
// Each line of the file must start with one hexadecimal 32-bit word; the words
// are returned in file order.
//
// Throws std::runtime_error if the file cannot be opened or if any line does
// not begin with a parsable hexadecimal value.
std::vector<uint32_t> load_instr_sequence(const std::string &instr_path) {
  std::ifstream instr_file(instr_path);
  // Fail loudly instead of silently returning an empty sequence, which would
  // otherwise only surface later as a confusing device-side failure.
  if (!instr_file)
    throw std::runtime_error("Unable to open instruction file " + instr_path +
                             "\n");

  std::vector<uint32_t> instr_v;
  std::string line;
  while (std::getline(instr_file, line)) {
    std::istringstream iss(line);
    uint32_t a;
    if (!(iss >> std::hex >> a)) {
      throw std::runtime_error("Unable to parse instruction file\n");
    }
    instr_v.push_back(a);
  }
  return instr_v;
}

int main(int argc, const char *argv[]) {

// Program arguments parsing
po::options_description desc("Allowed options");
desc.add_options()("help,h", "produce help message")(
"xclbin,x", po::value<std::string>()->required(),
"the input xclbin path")("verbosity,v",
po::value<int>()->default_value(0),
"the verbosity of the output")(
"instr,i", po::value<std::string>()->required(),
"path of file containing userspace instructions to be sent to the LX6");
po::variables_map vm;

try {
po::store(po::parse_command_line(argc, argv, desc), vm);
po::notify(vm);

if (vm.count("help")) {
std::cout << desc << "\n";
return 1;
}
} catch (const std::exception &ex) {
std::cerr << ex.what() << "\n\n";
std::cerr << "Usage:\n" << desc << "\n";
return 1;
}

check_arg_file_exists(vm, "xclbin");
check_arg_file_exists(vm, "instr");

std::vector<uint32_t> instr_v =
load_instr_sequence(vm["instr"].as<std::string>());

int verbosity = vm["verbosity"].as<int>();
if (verbosity >= 1)
std::cout << "Sequence instr count: " << instr_v.size() << "\n";

// Start the XRT test code
// Get a device handle
unsigned int device_index = 0;
auto device = xrt::device(device_index);

// Load the xclbin
if (verbosity >= 1)
std::cout << "Loading xclbin: " << vm["xclbin"].as<std::string>() << "\n";
auto xclbin = xrt::xclbin(vm["xclbin"].as<std::string>());

// Get the kernel from the xclbin
auto xkernels = xclbin.get_kernels();
auto xkernel0 = *std::find_if(xkernels.begin(), xkernels.end(),
[](xrt::xclbin::kernel &k) {
auto name = k.get_name();
std::cout << "Name: " << name << std::endl;
return name == "ADDONE";
});
auto kernelName0 = xkernel0.get_name();
auto xkernel1 = *std::find_if(xkernels.begin(), xkernels.end(),
[](xrt::xclbin::kernel &k) {
auto name = k.get_name();
std::cout << "Name: " << name << std::endl;
return name == "ADDTWO";
});
auto kernelName1 = xkernel1.get_name();

if (verbosity >= 1)
std::cout << "Registering xclbin: " << vm["xclbin"].as<std::string>()
<< "\n";

device.register_xclbin(xclbin);

// get a hardware context
if (verbosity >= 1)
std::cout << "Getting hardware context.\n";
xrt::hw_context context(device, xclbin.get_uuid());

// get a kernel handle
if (verbosity >= 1)
std::cout << "Getting handle to kernels: " << kernelName0 << " and "
<< kernelName1 << "\n";

auto kernel0 = xrt::kernel(context, kernelName0);

auto bo0_instr = xrt::bo(device, instr_v.size() * sizeof(int),
XCL_BO_FLAGS_CACHEABLE, kernel0.group_id(0));
auto bo0_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t),
XRT_BO_FLAGS_HOST_ONLY, kernel0.group_id(2));
auto bo0_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t),
XRT_BO_FLAGS_HOST_ONLY, kernel0.group_id(3));
auto bo0_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t),
XRT_BO_FLAGS_HOST_ONLY, kernel0.group_id(4));

auto kernel1 = xrt::kernel(context, kernelName1);

auto bo1_instr = xrt::bo(device, instr_v.size() * sizeof(int),
XCL_BO_FLAGS_CACHEABLE, kernel1.group_id(0));
auto bo1_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t),
XRT_BO_FLAGS_HOST_ONLY, kernel1.group_id(2));
auto bo1_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t),
XRT_BO_FLAGS_HOST_ONLY, kernel1.group_id(3));
auto bo1_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t),
XRT_BO_FLAGS_HOST_ONLY, kernel1.group_id(4));

if (verbosity >= 1)
std::cout << "Writing data into buffer objects.\n";

uint32_t *bufInA = bo0_inA.map<uint32_t *>();
std::vector<uint32_t> srcVecA;
for (int i = 0; i < IN_SIZE; i++)
srcVecA.push_back(i + 1);
memcpy(bufInA, srcVecA.data(), (srcVecA.size() * sizeof(uint32_t)));

void *bufInstr = bo0_instr.map<void *>();
memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));

bo0_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
bo0_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE);

if (verbosity >= 1)
std::cout << "Running Kernel 0.\n";

auto run0 = kernel0(bo0_instr, instr_v.size(), bo0_inA, bo0_inB, bo0_out);
run0.wait();

bo0_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
uint32_t *bufOut = bo0_out.map<uint32_t *>();

// same instructions as kernel1
bufInstr = bo1_instr.map<void *>();
memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
bo1_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);

// copy kernel0 output to kernel1 input
bufInA = bo1_inA.map<uint32_t *>();
memcpy(bufInA, bufOut, IN_SIZE * sizeof(uint32_t));
bo1_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE);

if (verbosity >= 1)
std::cout << "Running Kernel 1.\n";
auto run1 = kernel1(bo1_instr, instr_v.size(), bo1_inA, bo1_inB, bo1_out);
run1.wait();

bo1_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
bufOut = bo1_out.map<uint32_t *>();

int errors = 0;

for (uint32_t i = 0; i < 64; i++) {
uint32_t ref = (i + 1) + 1 + 2;
if (*(bufOut + i) != ref) {
std::cout << "Error in output " << *(bufOut + i) << " != " << ref
<< std::endl;
errors++;
} else {
std::cout << "Correct output " << *(bufOut + i) << " == " << ref
<< std::endl;
}
}

if (!errors) {
std::cout << "\nPASS!\n\n";
return 0;
} else {
std::cout << "\nfailed.\n\n";
return 1;
}
}

0 comments on commit b693d4e

Please sign in to comment.