Xilinx · fifield · May 28, 2024 · May 23, 2024 · May 23, 2024 · May 23, 2024
@@ -239,6 +239,12 @@ def parse_args(args=None):
         const=True,
         help="Generate xclbin",
     )
+    parser.add_argument(
+        "--xclbin-input",
+        dest="xclbin_input",
+        default=None,
+        help="Generate kernel into existing xclbin file",
+    )
     parser.add_argument(
         "--link_against_hsa",
         dest="link_against_hsa",

@@ -589,7 +589,25 @@ async def process_xclbin_gen(self):
 
         # fmt: off
         await self.do_call(task, ["bootgen", "-arch", "versal", "-image", self.prepend_tmp("design.bif"), "-o", self.prepend_tmp("design.pdi"), "-w"])
-        await self.do_call(task, ["xclbinutil", "--add-replace-section", "MEM_TOPOLOGY:JSON:" + self.prepend_tmp("mem_topology.json"), "--add-kernel", self.prepend_tmp("kernels.json"), "--add-replace-section", "AIE_PARTITION:JSON:" + self.prepend_tmp("aie_partition.json"), "--force", "--output", opts.xclbin_name])
+        if opts.xclbin_input:
+            await self.do_call(task, ["xclbinutil",
+                                      "--dump-section", "AIE_PARTITION:JSON:" + self.prepend_tmp("aie_input_partition.json"),
+                                      "--force", "--input", opts.xclbin_input])
+            with open(self.prepend_tmp("aie_input_partition.json")) as f:
+                input_partition = json.load(f)
+            with open(self.prepend_tmp("aie_partition.json")) as f:
+                new_partition = json.load(f)
+            input_partition["aie_partition"]["PDIs"].append(new_partition["aie_partition"]["PDIs"][0])
+            with open(self.prepend_tmp("aie_partition.json"), "w") as f:
+                json.dump(input_partition, f, indent=2)
+            flag = ['--input', opts.xclbin_input]
+        else:
+            flag = ["--add-replace-section", "MEM_TOPOLOGY:JSON:" + self.prepend_tmp("mem_topology.json")]
+
+        await self.do_call(task, ["xclbinutil"] + flag +
+                                 ["--add-kernel", self.prepend_tmp("kernels.json"),
+                                  "--add-replace-section", "AIE_PARTITION:JSON:" + self.prepend_tmp("aie_partition.json"),
+                                  "--force", "--output", opts.xclbin_name])
         # fmt: on
 
     async def process_host_cgen(self, aie_target, file_with_addresses):

@@ -0,0 +1,53 @@
+//===- aie1.mlir -----------------------------------------------*- MLIR -*-===//
+//
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Design summary: i32 data travels tile (0,0) -> (0,1) -> (0,2), the core on
+// tile (0,2) adds 1 to every element, and the result travels back along
+// (0,2) -> (0,1) -> (0,0).
+module {
+  aie.device(npu1_1col) {
+    // The three tiles used by this design, all in column 0 (rows 0, 1, 2).
+    %t00 = aie.tile(0, 0)
+    %t01 = aie.tile(0, 1)
+    %t02 = aie.tile(0, 2)
+
+    // Input path: 16xi32 objects from (0,0) to (0,1), linked to a second
+    // FIFO that delivers 8xi32 objects from (0,1) to (0,2).
+    aie.objectfifo @objFifo_in0(%t00, {%t01}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
+    aie.objectfifo @objFifo_in1(%t01, {%t02}, 2 : i32) : !aie.objectfifo<memref<8xi32>>
+    aie.objectfifo.link [@objFifo_in0] -> [@objFifo_in1] ()
+
+    // Output path, mirroring the input: 8xi32 objects from (0,2) to (0,1),
+    // linked to a FIFO that delivers 16xi32 objects from (0,1) to (0,0).
+    aie.objectfifo @objFifo_out1(%t02, {%t01}, 2 : i32) : !aie.objectfifo<memref<8xi32>>
+    aie.objectfifo @objFifo_out0(%t01, {%t00}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
+    aie.objectfifo.link [@objFifo_out1] -> [@objFifo_out0] ()
+
+    // Core program for tile (0,2): 8 outer iterations, each handling one
+    // 8xi32 object (64 elements in total).
+    aie.core(%t02) {
+      %c8 = arith.constant 8 : index
+      %c0 = arith.constant 0 : index
+      %c1 = arith.constant 1 : index
+      // The value added to every element.
+      %c1_32 = arith.constant 1 : i32
+
+      scf.for %steps = %c0 to %c8 step %c1 {
+        // Acquire one input object to consume and one output object to
+        // produce, and get memref views of both.
+        %subview0 = aie.objectfifo.acquire @objFifo_in1(Consume, 1) : !aie.objectfifosubview<memref<8xi32>>
+        %elem0 = aie.objectfifo.subview.access %subview0[0] : !aie.objectfifosubview<memref<8xi32>> -> memref<8xi32>
+        %subview1 = aie.objectfifo.acquire @objFifo_out1(Produce, 1) : !aie.objectfifosubview<memref<8xi32>>
+        %elem1 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview<memref<8xi32>> -> memref<8xi32>
+        // Element-wise: out[i] = in[i] + 1 for the 8 elements of the object.
+        scf.for %arg3 = %c0 to %c8 step %c1 {
+            %0 = memref.load %elem0[%arg3] : memref<8xi32>
+            %1 = arith.addi %0, %c1_32 : i32
+            memref.store %1, %elem1[%arg3] : memref<8xi32>
+        }
+        // Release both objects before the next iteration.
+        aie.objectfifo.release @objFifo_in1(Consume, 1)
+        aie.objectfifo.release @objFifo_out1(Produce, 1)
+      }
+      aie.end
+    }
+    // Host-side sequence: describes a 64-element transfer for %out (tied to
+    // @objFifo_out0) and for %in (tied to @objFifo_in0), then issues a sync.
+    // %buf is declared but not referenced in the body.
+    func.func @sequence(%in : memref<64xi32>, %buf : memref<32xi32>, %out : memref<64xi32>) {
+      %c0 = arith.constant 0 : i64
+      %c1 = arith.constant 1 : i64
+      %c64 = arith.constant 64 : i64
+      aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_out0, id = 1 : i64 } : memref<64xi32>
+      aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<64xi32>
+      // NOTE(review): presumably waits for the output transfer on tile (0,0)
+      // to finish - confirm against the aiex dialect documentation.
+      aiex.npu.sync { column = 0 : i32, row = 0 : i32, direction = 0 : i32, channel = 0 : i32, column_num = 1 : i32, row_num = 1 : i32 }
+      return
+    }
+  }
+}
@@ -0,0 +1,53 @@
+//===- aie2.mlir -----------------------------------------------*- MLIR -*-===//
+//
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Design summary: i32 data travels tile (0,0) -> (0,1) -> (0,2), the core on
+// tile (0,2) adds 2 to every element, and the result travels back along
+// (0,2) -> (0,1) -> (0,0).
+module {
+  aie.device(npu1_1col) {
+    // The three tiles used by this design, all in column 0 (rows 0, 1, 2).
+    %t00 = aie.tile(0, 0)
+    %t01 = aie.tile(0, 1)
+    %t02 = aie.tile(0, 2)
+
+    // Input path: 16xi32 objects from (0,0) to (0,1), linked to a second
+    // FIFO that delivers 8xi32 objects from (0,1) to (0,2).
+    aie.objectfifo @objFifo_in0(%t00, {%t01}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
+    aie.objectfifo @objFifo_in1(%t01, {%t02}, 2 : i32) : !aie.objectfifo<memref<8xi32>>
+    aie.objectfifo.link [@objFifo_in0] -> [@objFifo_in1] ()
+
+    // Output path, mirroring the input: 8xi32 objects from (0,2) to (0,1),
+    // linked to a FIFO that delivers 16xi32 objects from (0,1) to (0,0).
+    aie.objectfifo @objFifo_out1(%t02, {%t01}, 2 : i32) : !aie.objectfifo<memref<8xi32>>
+    aie.objectfifo @objFifo_out0(%t01, {%t00}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
+    aie.objectfifo.link [@objFifo_out1] -> [@objFifo_out0] ()
+
+    // Core program for tile (0,2): 8 outer iterations, each handling one
+    // 8xi32 object (64 elements in total).
+    aie.core(%t02) {
+      %c8 = arith.constant 8 : index
+      %c0 = arith.constant 0 : index
+      %c1 = arith.constant 1 : index
+      // The value added to every element.
+      %c2_32 = arith.constant 2 : i32
+
+      scf.for %steps = %c0 to %c8 step %c1 {
+        // Acquire one input object to consume and one output object to
+        // produce, and get memref views of both.
+        %subview0 = aie.objectfifo.acquire @objFifo_in1(Consume, 1) : !aie.objectfifosubview<memref<8xi32>>
+        %elem0 = aie.objectfifo.subview.access %subview0[0] : !aie.objectfifosubview<memref<8xi32>> -> memref<8xi32>
+        %subview1 = aie.objectfifo.acquire @objFifo_out1(Produce, 1) : !aie.objectfifosubview<memref<8xi32>>
+        %elem1 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview<memref<8xi32>> -> memref<8xi32>
+        // Element-wise: out[i] = in[i] + 2 for the 8 elements of the object.
+        scf.for %arg3 = %c0 to %c8 step %c1 {
+            %0 = memref.load %elem0[%arg3] : memref<8xi32>
+            %1 = arith.addi %0, %c2_32 : i32
+            memref.store %1, %elem1[%arg3] : memref<8xi32>
+        }
+        // Release both objects before the next iteration.
+        aie.objectfifo.release @objFifo_in1(Consume, 1)
+        aie.objectfifo.release @objFifo_out1(Produce, 1)
+      }
+      aie.end
+    }
+    // Host-side sequence: describes a 64-element transfer for %out (tied to
+    // @objFifo_out0) and for %in (tied to @objFifo_in0), then issues a sync.
+    // %buf is declared but not referenced in the body.
+    func.func @sequence(%in : memref<64xi32>, %buf : memref<32xi32>, %out : memref<64xi32>) {
+      %c0 = arith.constant 0 : i64
+      %c1 = arith.constant 1 : i64
+      %c64 = arith.constant 64 : i64
+      aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_out0, id = 1 : i64 } : memref<64xi32>
+      aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<64xi32>
+      // NOTE(review): presumably waits for the output transfer on tile (0,0)
+      // to finish - confirm against the aiex dialect documentation.
+      aiex.npu.sync { column = 0 : i32, row = 0 : i32, direction = 0 : i32, channel = 0 : i32, column_num = 1 : i32, row_num = 1 : i32 }
+      return
+    }
+  }
+}
@@ -0,0 +1,11 @@
+// (c) Copyright 2023 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai
+//
+// RUN: %python aiecc.py --xclbin-kernel-name=ADDONE --xclbin-kernel-id=0x901 --xclbin-instance-name=ADDONEINST --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=add_one.xclbin --npu-insts-name=insts.txt %S/aie1.mlir
+// RUN: %python aiecc.py --xclbin-kernel-name=ADDTWO --xclbin-kernel-id=0x902 --xclbin-instance-name=ADDTWOINST --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-input=add_one.xclbin --xclbin-name=add_two.xclbin --npu-insts-name=insts.txt %S/aie2.mlir
+// RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
+// RUN: %run_on_npu ./test.exe -x add_two.xclbin -i insts.txt | FileCheck %s
+// CHECK: PASS!
+
@@ -0,0 +1,221 @@
+//===- test.cpp -------------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <boost/program_options.hpp>
+#include <cstdint>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+constexpr int IN_SIZE = 64;
+constexpr int OUT_SIZE = 64;
+
+namespace po = boost::program_options;
+
+// Verify that option `name` is present in `vm_in` and that the path stored
+// under it can be opened for reading.
+// Throws std::runtime_error when the option is absent or the file cannot be
+// opened.
+void check_arg_file_exists(po::variables_map &vm_in, std::string name) {
+  // Guard: the option must have been supplied.
+  if (vm_in.count(name) == 0)
+    throw std::runtime_error("Error: no " + name + " file was provided\n");
+
+  // Probe the path by opening it; a stream in a failed state means the file
+  // is not readable.
+  const std::string path = vm_in[name].as<std::string>();
+  std::ifstream probe(path);
+  if (probe)
+    return;
+
+  throw std::runtime_error("The " + name + " file " + path +
+                           " does not exist.\n");
+}
+
+// Load an instruction sequence from a text file holding one hexadecimal
+// 32-bit word per line (only the first token of each line is parsed).
+//
+// instr_path: path of the text file to read.
+// Returns the parsed words in file order.
+// Throws std::runtime_error when the file cannot be opened or when a line
+// does not start with a parsable hexadecimal value.
+std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
+  std::ifstream instr_file(instr_path);
+  // Fail loudly: without this check an unreadable file would silently yield
+  // an empty instruction sequence.
+  if (!instr_file) {
+    throw std::runtime_error("Unable to open instruction file " + instr_path +
+                             "\n");
+  }
+
+  std::vector<uint32_t> instr_v;
+  std::string line;
+  while (std::getline(instr_file, line)) {
+    std::istringstream iss(line);
+    uint32_t word;
+    if (!(iss >> std::hex >> word)) {
+      throw std::runtime_error("Unable to parse instruction file\n");
+    }
+    instr_v.push_back(word);
+  }
+  return instr_v;
+}
+
+// Test driver: loads an xclbin that is expected to contain the ADDONE and
+// ADDTWO kernels, runs ADDONE on the values 1..IN_SIZE, feeds its output to
+// ADDTWO, and checks that every output element equals its input plus 3.
+//
+// Returns 0 and prints "PASS!" when all elements match. Returns 1 on a usage
+// error, when --help is given, when either kernel is missing from the
+// xclbin, or on any output mismatch. Exceptions thrown by
+// check_arg_file_exists and load_instr_sequence are not caught here.
+int main(int argc, const char *argv[]) {
+
+  // Program arguments parsing
+  po::options_description desc("Allowed options");
+  desc.add_options()("help,h", "produce help message")(
+      "xclbin,x", po::value<std::string>()->required(),
+      "the input xclbin path")("verbosity,v",
+                               po::value<int>()->default_value(0),
+                               "the verbosity of the output")(
+      "instr,i", po::value<std::string>()->required(),
+      "path of file containing userspace instructions to be sent to the LX6");
+  po::variables_map vm;
+
+  try {
+    po::store(po::parse_command_line(argc, argv, desc), vm);
+
+    // Handle --help before notify(): notify() enforces the required()
+    // options and would otherwise reject a bare "--help".
+    if (vm.count("help")) {
+      std::cout << desc << "\n";
+      return 1;
+    }
+
+    po::notify(vm);
+  } catch (const std::exception &ex) {
+    std::cerr << ex.what() << "\n\n";
+    std::cerr << "Usage:\n" << desc << "\n";
+    return 1;
+  }
+
+  check_arg_file_exists(vm, "xclbin");
+  check_arg_file_exists(vm, "instr");
+
+  std::vector<uint32_t> instr_v =
+      load_instr_sequence(vm["instr"].as<std::string>());
+
+  int verbosity = vm["verbosity"].as<int>();
+  if (verbosity >= 1)
+    std::cout << "Sequence instr count: " << instr_v.size() << "\n";
+
+  // Start the XRT test code
+  // Get a device handle
+  unsigned int device_index = 0;
+  auto device = xrt::device(device_index);
+
+  // Load the xclbin
+  if (verbosity >= 1)
+    std::cout << "Loading xclbin: " << vm["xclbin"].as<std::string>() << "\n";
+  auto xclbin = xrt::xclbin(vm["xclbin"].as<std::string>());
+
+  // Get the kernels from the xclbin. Each search prints the name of every
+  // kernel it visits.
+  auto xkernels = xclbin.get_kernels();
+  auto find_kernel = [&xkernels](const std::string &wanted) {
+    return std::find_if(xkernels.begin(), xkernels.end(),
+                        [&wanted](xrt::xclbin::kernel &k) {
+                          auto name = k.get_name();
+                          std::cout << "Name: " << name << std::endl;
+                          return name == wanted;
+                        });
+  };
+  auto kernel_it0 = find_kernel("ADDONE");
+  auto kernel_it1 = find_kernel("ADDTWO");
+
+  // Never dereference end(): report a missing kernel instead.
+  if (kernel_it0 == xkernels.end() || kernel_it1 == xkernels.end()) {
+    std::cerr << "Error: xclbin " << vm["xclbin"].as<std::string>()
+              << " does not contain both the ADDONE and ADDTWO kernels\n";
+    return 1;
+  }
+  auto kernelName0 = kernel_it0->get_name();
+  auto kernelName1 = kernel_it1->get_name();
+
+  if (verbosity >= 1)
+    std::cout << "Registering xclbin: " << vm["xclbin"].as<std::string>()
+              << "\n";
+
+  device.register_xclbin(xclbin);
+
+  // get a hardware context
+  if (verbosity >= 1)
+    std::cout << "Getting hardware context.\n";
+  xrt::hw_context context(device, xclbin.get_uuid());
+
+  // get a kernel handle
+  if (verbosity >= 1)
+    std::cout << "Getting handle to kernels: " << kernelName0 << " and "
+              << kernelName1 << "\n";
+
+  // Size of the instruction buffer in bytes; instr_v holds uint32_t words.
+  const size_t instr_bytes = instr_v.size() * sizeof(uint32_t);
+
+  auto kernel0 = xrt::kernel(context, kernelName0);
+
+  auto bo0_instr = xrt::bo(device, instr_bytes, XCL_BO_FLAGS_CACHEABLE,
+                           kernel0.group_id(0));
+  auto bo0_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t),
+                         XRT_BO_FLAGS_HOST_ONLY, kernel0.group_id(2));
+  auto bo0_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t),
+                         XRT_BO_FLAGS_HOST_ONLY, kernel0.group_id(3));
+  auto bo0_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t),
+                         XRT_BO_FLAGS_HOST_ONLY, kernel0.group_id(4));
+
+  auto kernel1 = xrt::kernel(context, kernelName1);
+
+  auto bo1_instr = xrt::bo(device, instr_bytes, XCL_BO_FLAGS_CACHEABLE,
+                           kernel1.group_id(0));
+  auto bo1_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t),
+                         XRT_BO_FLAGS_HOST_ONLY, kernel1.group_id(2));
+  auto bo1_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t),
+                         XRT_BO_FLAGS_HOST_ONLY, kernel1.group_id(3));
+  auto bo1_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t),
+                         XRT_BO_FLAGS_HOST_ONLY, kernel1.group_id(4));
+
+  if (verbosity >= 1)
+    std::cout << "Writing data into buffer objects.\n";
+
+  // Kernel 0 input: the values 1..IN_SIZE.
+  uint32_t *bufInA = bo0_inA.map<uint32_t *>();
+  std::vector<uint32_t> srcVecA;
+  for (int i = 0; i < IN_SIZE; i++)
+    srcVecA.push_back(i + 1);
+  memcpy(bufInA, srcVecA.data(), (srcVecA.size() * sizeof(uint32_t)));
+
+  void *bufInstr = bo0_instr.map<void *>();
+  memcpy(bufInstr, instr_v.data(), instr_bytes);
+
+  bo0_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo0_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  if (verbosity >= 1)
+    std::cout << "Running Kernel 0.\n";
+
+  auto run0 = kernel0(bo0_instr, instr_v.size(), bo0_inA, bo0_inB, bo0_out);
+  run0.wait();
+
+  bo0_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+  uint32_t *bufOut = bo0_out.map<uint32_t *>();
+
+  // kernel 1 is driven by the same instruction sequence as kernel 0
+  bufInstr = bo1_instr.map<void *>();
+  memcpy(bufInstr, instr_v.data(), instr_bytes);
+  bo1_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  // copy kernel0 output to kernel1 input
+  bufInA = bo1_inA.map<uint32_t *>();
+  memcpy(bufInA, bufOut, IN_SIZE * sizeof(uint32_t));
+  bo1_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  if (verbosity >= 1)
+    std::cout << "Running Kernel 1.\n";
+  auto run1 = kernel1(bo1_instr, instr_v.size(), bo1_inA, bo1_inB, bo1_out);
+  run1.wait();
+
+  bo1_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+  bufOut = bo1_out.map<uint32_t *>();
+
+  int errors = 0;
+
+  // Expected value: the input (i + 1), plus 1 from ADDONE, plus 2 from ADDTWO.
+  for (uint32_t i = 0; i < static_cast<uint32_t>(OUT_SIZE); i++) {
+    uint32_t ref = (i + 1) + 1 + 2;
+    if (*(bufOut + i) != ref) {
+      std::cout << "Error in output " << *(bufOut + i) << " != " << ref
+                << std::endl;
+      errors++;
+    } else {
+      std::cout << "Correct output " << *(bufOut + i) << " == " << ref
+                << std::endl;
+    }
+  }
+
+  if (!errors) {
+    std::cout << "\nPASS!\n\n";
+    return 0;
+  } else {
+    std::cout << "\nfailed.\n\n";
+    return 1;
+  }
+}