Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

POC multiple kernels in one xclbin #1508

Merged
merged 3 commits into from
May 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions python/compiler/aiecc/cl_arguments.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,12 @@ def parse_args(args=None):
const=True,
help="Generate xclbin",
)
parser.add_argument(
"--xclbin-input",
dest="xclbin_input",
default=None,
help="Generate kernel into existing xclbin file",
)
parser.add_argument(
"--link_against_hsa",
dest="link_against_hsa",
Expand Down
20 changes: 19 additions & 1 deletion python/compiler/aiecc/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -589,7 +589,25 @@ async def process_xclbin_gen(self):

# fmt: off
await self.do_call(task, ["bootgen", "-arch", "versal", "-image", self.prepend_tmp("design.bif"), "-o", self.prepend_tmp("design.pdi"), "-w"])
await self.do_call(task, ["xclbinutil", "--add-replace-section", "MEM_TOPOLOGY:JSON:" + self.prepend_tmp("mem_topology.json"), "--add-kernel", self.prepend_tmp("kernels.json"), "--add-replace-section", "AIE_PARTITION:JSON:" + self.prepend_tmp("aie_partition.json"), "--force", "--output", opts.xclbin_name])
if opts.xclbin_input:
await self.do_call(task, ["xclbinutil",
"--dump-section", "AIE_PARTITION:JSON:" + self.prepend_tmp("aie_input_partition.json"),
"--force", "--input", opts.xclbin_input])
with open(self.prepend_tmp("aie_input_partition.json")) as f:
input_partition = json.load(f)
with open(self.prepend_tmp("aie_partition.json")) as f:
new_partition = json.load(f)
input_partition["aie_partition"]["PDIs"].append(new_partition["aie_partition"]["PDIs"][0])
with open(self.prepend_tmp("aie_partition.json"), "w") as f:
json.dump(input_partition, f, indent=2)
flag = ['--input', opts.xclbin_input]
else:
flag = ["--add-replace-section", "MEM_TOPOLOGY:JSON:" + self.prepend_tmp("mem_topology.json")]

await self.do_call(task, ["xclbinutil"] + flag +
["--add-kernel", self.prepend_tmp("kernels.json"),
"--add-replace-section", "AIE_PARTITION:JSON:" + self.prepend_tmp("aie_partition.json"),
"--force", "--output", opts.xclbin_name])
# fmt: on

async def process_host_cgen(self, aie_target, file_with_addresses):
Expand Down
53 changes: 53 additions & 0 deletions test/npu-xrt/add_one_two/aie1.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
//===- aie1.mlir -----------------------------------------------*- MLIR -*-===//
//
// Copyright (C) 2023, Advanced Micro Devices, Inc.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// "Add one" test design: the core on tile (0, 2) reads 8xi32 objects, adds the
// constant 1 to every element and writes the results to an output object FIFO.
module {
aie.device(npu1_1col) {
// Three tiles in column 0, at rows 0, 1 and 2.
%t00 = aie.tile(0, 0)
%t01 = aie.tile(0, 1)
%t02 = aie.tile(0, 2)

// Input path: %t00 -> %t01 carries 16xi32 objects, %t01 -> %t02 carries 8xi32
// objects; the two FIFOs are linked at %t01.
// NOTE(review): the `2 : i32` operand is presumably the FIFO depth — confirm.
aie.objectfifo @objFifo_in0(%t00, {%t01}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
aie.objectfifo @objFifo_in1(%t01, {%t02}, 2 : i32) : !aie.objectfifo<memref<8xi32>>
aie.objectfifo.link [@objFifo_in0] -> [@objFifo_in1] ()

// Output path, mirroring the input path: %t02 -> %t01 with 8xi32 objects,
// linked into %t01 -> %t00 with 16xi32 objects.
aie.objectfifo @objFifo_out1(%t02, {%t01}, 2 : i32) : !aie.objectfifo<memref<8xi32>>
aie.objectfifo @objFifo_out0(%t01, {%t00}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
aie.objectfifo.link [@objFifo_out1] -> [@objFifo_out0] ()

// Core program for tile (0, 2).
aie.core(%t02) {
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
// The value added to every input element.
%c1_32 = arith.constant 1 : i32

// Outer loop: 8 iterations, each handling one 8xi32 object (64 elements total).
scf.for %steps = %c0 to %c8 step %c1 {
// Acquire one input object to consume and one output object to produce.
%subview0 = aie.objectfifo.acquire @objFifo_in1(Consume, 1) : !aie.objectfifosubview<memref<8xi32>>
%elem0 = aie.objectfifo.subview.access %subview0[0] : !aie.objectfifosubview<memref<8xi32>> -> memref<8xi32>
%subview1 = aie.objectfifo.acquire @objFifo_out1(Produce, 1) : !aie.objectfifosubview<memref<8xi32>>
%elem1 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview<memref<8xi32>> -> memref<8xi32>
// Inner loop: out[i] = in[i] + 1 for the 8 elements of the object.
scf.for %arg3 = %c0 to %c8 step %c1 {
%0 = memref.load %elem0[%arg3] : memref<8xi32>
%1 = arith.addi %0, %c1_32 : i32
memref.store %1, %elem1[%arg3] : memref<8xi32>
}
// Release both objects once the element-wise add is done.
aie.objectfifo.release @objFifo_in1(Consume, 1)
aie.objectfifo.release @objFifo_out1(Produce, 1)
}
aie.end
}
// Host-side sequence taking three buffers; %buf is not referenced in the body.
func.func @sequence(%in : memref<64xi32>, %buf : memref<32xi32>, %out : memref<64xi32>) {
%c0 = arith.constant 0 : i64
%c1 = arith.constant 1 : i64
%c64 = arith.constant 64 : i64
// One transfer for %out (tied to @objFifo_out0, id 1) and one for %in (tied to
// @objFifo_in0, id 0). NOTE(review): the second bracket [1,1,1,64] is presumably
// the per-dimension transfer size, i.e. all 64 elements — confirm.
aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_out0, id = 1 : i64 } : memref<64xi32>
aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<64xi32>
// NOTE(review): presumably blocks until the transfer on column 0, row 0,
// channel 0 has completed — confirm against the aiex.npu.sync documentation.
aiex.npu.sync { column = 0 : i32, row = 0 : i32, direction = 0 : i32, channel = 0 : i32, column_num = 1 : i32, row_num = 1 : i32 }
return
}
}
}
53 changes: 53 additions & 0 deletions test/npu-xrt/add_one_two/aie2.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
//===- aie2.mlir -----------------------------------------------*- MLIR -*-===//
//
// Copyright (C) 2023, Advanced Micro Devices, Inc.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// "Add two" test design: the core on tile (0, 2) reads 8xi32 objects, adds the
// constant 2 to every element and writes the results to an output object FIFO.
module {
aie.device(npu1_1col) {
// Three tiles in column 0, at rows 0, 1 and 2.
%t00 = aie.tile(0, 0)
%t01 = aie.tile(0, 1)
%t02 = aie.tile(0, 2)

// Input path: %t00 -> %t01 carries 16xi32 objects, %t01 -> %t02 carries 8xi32
// objects; the two FIFOs are linked at %t01.
// NOTE(review): the `2 : i32` operand is presumably the FIFO depth — confirm.
aie.objectfifo @objFifo_in0(%t00, {%t01}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
aie.objectfifo @objFifo_in1(%t01, {%t02}, 2 : i32) : !aie.objectfifo<memref<8xi32>>
aie.objectfifo.link [@objFifo_in0] -> [@objFifo_in1] ()

// Output path, mirroring the input path: %t02 -> %t01 with 8xi32 objects,
// linked into %t01 -> %t00 with 16xi32 objects.
aie.objectfifo @objFifo_out1(%t02, {%t01}, 2 : i32) : !aie.objectfifo<memref<8xi32>>
aie.objectfifo @objFifo_out0(%t01, {%t00}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
aie.objectfifo.link [@objFifo_out1] -> [@objFifo_out0] ()

// Core program for tile (0, 2).
aie.core(%t02) {
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
// The value added to every input element.
%c2_32 = arith.constant 2 : i32

// Outer loop: 8 iterations, each handling one 8xi32 object (64 elements total).
scf.for %steps = %c0 to %c8 step %c1 {
// Acquire one input object to consume and one output object to produce.
%subview0 = aie.objectfifo.acquire @objFifo_in1(Consume, 1) : !aie.objectfifosubview<memref<8xi32>>
%elem0 = aie.objectfifo.subview.access %subview0[0] : !aie.objectfifosubview<memref<8xi32>> -> memref<8xi32>
%subview1 = aie.objectfifo.acquire @objFifo_out1(Produce, 1) : !aie.objectfifosubview<memref<8xi32>>
%elem1 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview<memref<8xi32>> -> memref<8xi32>
// Inner loop: out[i] = in[i] + 2 for the 8 elements of the object.
scf.for %arg3 = %c0 to %c8 step %c1 {
%0 = memref.load %elem0[%arg3] : memref<8xi32>
%1 = arith.addi %0, %c2_32 : i32
memref.store %1, %elem1[%arg3] : memref<8xi32>
}
// Release both objects once the element-wise add is done.
aie.objectfifo.release @objFifo_in1(Consume, 1)
aie.objectfifo.release @objFifo_out1(Produce, 1)
}
aie.end
}
// Host-side sequence taking three buffers; %buf is not referenced in the body.
func.func @sequence(%in : memref<64xi32>, %buf : memref<32xi32>, %out : memref<64xi32>) {
%c0 = arith.constant 0 : i64
%c1 = arith.constant 1 : i64
%c64 = arith.constant 64 : i64
// One transfer for %out (tied to @objFifo_out0, id 1) and one for %in (tied to
// @objFifo_in0, id 0). NOTE(review): the second bracket [1,1,1,64] is presumably
// the per-dimension transfer size, i.e. all 64 elements — confirm.
aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_out0, id = 1 : i64 } : memref<64xi32>
aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<64xi32>
// NOTE(review): presumably blocks until the transfer on column 0, row 0,
// channel 0 has completed — confirm against the aiex.npu.sync documentation.
aiex.npu.sync { column = 0 : i32, row = 0 : i32, direction = 0 : i32, channel = 0 : i32, column_num = 1 : i32, row_num = 1 : i32 }
return
}
}
}
11 changes: 11 additions & 0 deletions test/npu-xrt/add_one_two/run.lit
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
// (c) Copyright 2023 Advanced Micro Devices, Inc.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// REQUIRES: ryzen_ai
//
// RUN: %python aiecc.py --xclbin-kernel-name=ADDONE --xclbin-kernel-id=0x901 --xclbin-instance-name=ADDONEINST --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=add_one.xclbin --npu-insts-name=insts.txt %S/aie1.mlir
// RUN: %python aiecc.py --xclbin-kernel-name=ADDTWO --xclbin-kernel-id=0x902 --xclbin-instance-name=ADDTWOINST --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-input=add_one.xclbin --xclbin-name=add_two.xclbin --npu-insts-name=insts.txt %S/aie2.mlir
// RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
// RUN: %run_on_npu ./test.exe -x add_two.xclbin -i insts.txt | FileCheck %s
// CHECK: PASS!

221 changes: 221 additions & 0 deletions test/npu-xrt/add_one_two/test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,221 @@
//===- test.cpp -------------------------------------------------*- C++ -*-===//
//
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// Copyright (C) 2023, Advanced Micro Devices, Inc.
//
//===----------------------------------------------------------------------===//

#include <boost/program_options.hpp>
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <fstream>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

#include "xrt/xrt_bo.h"
#include "xrt/xrt_device.h"
#include "xrt/xrt_kernel.h"

constexpr int IN_SIZE = 64;
constexpr int OUT_SIZE = 64;

namespace po = boost::program_options;

/// Verifies that the option `name` was supplied and names a readable file.
///
/// @param vm_in parsed command-line options.
/// @param name  option key whose value is a file path.
/// @throws std::runtime_error if the option is absent, or if an input stream
///         cannot be opened on the path it holds.
void check_arg_file_exists(po::variables_map &vm_in, std::string name) {
  // Guard: the option must have been given at all.
  if (vm_in.count(name) == 0)
    throw std::runtime_error("Error: no " + name + " file was provided\n");

  // Probe the path by opening it for reading; the stream closes on return.
  const std::string path = vm_in[name].as<std::string>();
  std::ifstream probe(path);
  if (probe)
    return;

  throw std::runtime_error("The " + name + " file " + path +
                           " does not exist.\n");
}

/// Loads an instruction sequence from a text file.
///
/// Each line of the file is parsed as one hexadecimal 32-bit word; the words
/// are returned in file order.
///
/// @param instr_path path of the text file to read.
/// @return the parsed words, one per line of the file.
/// @throws std::runtime_error if the file cannot be opened, or if any line
///         (including an empty one) does not start with a hexadecimal number.
std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
  std::ifstream instr_file(instr_path);
  // Fail loudly instead of silently returning an empty sequence when the file
  // is missing or unreadable.
  if (!instr_file) {
    throw std::runtime_error("Unable to open instruction file " + instr_path +
                             "\n");
  }

  std::vector<uint32_t> instr_v;
  std::string line;
  while (std::getline(instr_file, line)) {
    std::istringstream iss(line);
    uint32_t word;
    if (!(iss >> std::hex >> word)) {
      throw std::runtime_error("Unable to parse instruction file\n");
    }
    instr_v.push_back(word);
  }
  return instr_v;
}

int main(int argc, const char *argv[]) {

// Program arguments parsing
po::options_description desc("Allowed options");
desc.add_options()("help,h", "produce help message")(
"xclbin,x", po::value<std::string>()->required(),
"the input xclbin path")("verbosity,v",
po::value<int>()->default_value(0),
"the verbosity of the output")(
"instr,i", po::value<std::string>()->required(),
"path of file containing userspace instructions to be sent to the LX6");
po::variables_map vm;

try {
po::store(po::parse_command_line(argc, argv, desc), vm);
po::notify(vm);

if (vm.count("help")) {
std::cout << desc << "\n";
return 1;
}
} catch (const std::exception &ex) {
std::cerr << ex.what() << "\n\n";
std::cerr << "Usage:\n" << desc << "\n";
return 1;
}

check_arg_file_exists(vm, "xclbin");
check_arg_file_exists(vm, "instr");

std::vector<uint32_t> instr_v =
load_instr_sequence(vm["instr"].as<std::string>());

int verbosity = vm["verbosity"].as<int>();
if (verbosity >= 1)
std::cout << "Sequence instr count: " << instr_v.size() << "\n";

// Start the XRT test code
// Get a device handle
unsigned int device_index = 0;
auto device = xrt::device(device_index);

// Load the xclbin
if (verbosity >= 1)
std::cout << "Loading xclbin: " << vm["xclbin"].as<std::string>() << "\n";
auto xclbin = xrt::xclbin(vm["xclbin"].as<std::string>());

// Get the kernel from the xclbin
auto xkernels = xclbin.get_kernels();
auto xkernel0 = *std::find_if(xkernels.begin(), xkernels.end(),
[](xrt::xclbin::kernel &k) {
auto name = k.get_name();
std::cout << "Name: " << name << std::endl;
return name == "ADDONE";
});
auto kernelName0 = xkernel0.get_name();
auto xkernel1 = *std::find_if(xkernels.begin(), xkernels.end(),
[](xrt::xclbin::kernel &k) {
auto name = k.get_name();
std::cout << "Name: " << name << std::endl;
return name == "ADDTWO";
});
auto kernelName1 = xkernel1.get_name();

if (verbosity >= 1)
std::cout << "Registering xclbin: " << vm["xclbin"].as<std::string>()
<< "\n";

device.register_xclbin(xclbin);

// get a hardware context
if (verbosity >= 1)
std::cout << "Getting hardware context.\n";
xrt::hw_context context(device, xclbin.get_uuid());

// get a kernel handle
if (verbosity >= 1)
std::cout << "Getting handle to kernels: " << kernelName0 << " and "
<< kernelName1 << "\n";

auto kernel0 = xrt::kernel(context, kernelName0);

auto bo0_instr = xrt::bo(device, instr_v.size() * sizeof(int),
XCL_BO_FLAGS_CACHEABLE, kernel0.group_id(0));
auto bo0_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t),
XRT_BO_FLAGS_HOST_ONLY, kernel0.group_id(2));
auto bo0_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t),
XRT_BO_FLAGS_HOST_ONLY, kernel0.group_id(3));
auto bo0_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t),
XRT_BO_FLAGS_HOST_ONLY, kernel0.group_id(4));

auto kernel1 = xrt::kernel(context, kernelName1);

auto bo1_instr = xrt::bo(device, instr_v.size() * sizeof(int),
XCL_BO_FLAGS_CACHEABLE, kernel1.group_id(0));
auto bo1_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t),
XRT_BO_FLAGS_HOST_ONLY, kernel1.group_id(2));
auto bo1_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t),
XRT_BO_FLAGS_HOST_ONLY, kernel1.group_id(3));
auto bo1_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t),
XRT_BO_FLAGS_HOST_ONLY, kernel1.group_id(4));

if (verbosity >= 1)
std::cout << "Writing data into buffer objects.\n";

uint32_t *bufInA = bo0_inA.map<uint32_t *>();
std::vector<uint32_t> srcVecA;
for (int i = 0; i < IN_SIZE; i++)
srcVecA.push_back(i + 1);
memcpy(bufInA, srcVecA.data(), (srcVecA.size() * sizeof(uint32_t)));

void *bufInstr = bo0_instr.map<void *>();
memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));

bo0_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
bo0_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE);

if (verbosity >= 1)
std::cout << "Running Kernel 0.\n";

auto run0 = kernel0(bo0_instr, instr_v.size(), bo0_inA, bo0_inB, bo0_out);
run0.wait();

bo0_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
uint32_t *bufOut = bo0_out.map<uint32_t *>();

// same instructions as kernel1
bufInstr = bo1_instr.map<void *>();
memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
bo1_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);

// copy kernel0 output to kernel1 input
bufInA = bo1_inA.map<uint32_t *>();
memcpy(bufInA, bufOut, IN_SIZE * sizeof(uint32_t));
bo1_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE);

if (verbosity >= 1)
std::cout << "Running Kernel 1.\n";
auto run1 = kernel1(bo1_instr, instr_v.size(), bo1_inA, bo1_inB, bo1_out);
run1.wait();

bo1_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
bufOut = bo1_out.map<uint32_t *>();

int errors = 0;

for (uint32_t i = 0; i < 64; i++) {
uint32_t ref = (i + 1) + 1 + 2;
if (*(bufOut + i) != ref) {
std::cout << "Error in output " << *(bufOut + i) << " != " << ref
<< std::endl;
errors++;
} else {
std::cout << "Correct output " << *(bufOut + i) << " == " << ref
<< std::endl;
}
}

if (!errors) {
std::cout << "\nPASS!\n\n";
return 0;
} else {
std::cout << "\nfailed.\n\n";
return 1;
}
}
Loading