diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 1b7bb8a816..648b067e4a 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -19,11 +19,9 @@ programming_examples/ @denolf @jgmelber @fifield python/ @makslevental @AndraBisca reference_designs/ @denolf @jgmelber @fifield runtime_lib/ @stephenneuendorffer -test/aie2xclbin/ @erieaton-amd @newling test/aievec/ @jsetoain test/python/ @makslevental @AndraBisca test/objectFifo*/ @AndraBisca tools/ @stephenneuendorffer @makslevental -tools/aie2xclbin/ @erieaton-amd @newling tutorials/ @denolf @jackl-xilinx utils/ @makslevental @jackl-xilinx @jgmelber diff --git a/test/aie2xclbin/buffers_xclbin.mlir b/test/aie2xclbin/buffers_xclbin.mlir deleted file mode 100644 index 0d46ecbc8b..0000000000 --- a/test/aie2xclbin/buffers_xclbin.mlir +++ /dev/null @@ -1,135 +0,0 @@ -//===- buffers_xclbin.mlir --------------------------------------*- MLIR -*-===// -// -// This file is licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// (c) Copyright 2024 AMD Inc. -// -//===----------------------------------------------------------------------===// - -// REQUIRES: peano - -// RUN: aie2xclbin -v --host-target=aarch64-linux-gnu --peano=%PEANO_INSTALL_DIR %s --tmpdir=%T/buffers_xclbin.mlir.prj --xclbin-name=test.xclbin -// RUN: FileCheck %s --input-file=%T/buffers_xclbin.mlir.prj/kernels.json - -// CHECK: { -// CHECK: "ps-kernels": { -// CHECK: "kernels": [ -// CHECK: { -// CHECK: "arguments": [ -// CHECK: { -// CHECK: "address-qualifier": "SCALAR", -// CHECK: "name": "opcode", -// CHECK: "offset": "0x00", -// CHECK: "type": "uint64_t" -// CHECK: }, -// CHECK: { -// CHECK: "address-qualifier": "GLOBAL", -// CHECK: "memory-connection": "SRAM", -// CHECK: "name": "instr", -// CHECK: "offset": "0x08", -// CHECK: "type": "char *" -// CHECK: }, -// CHECK: { -// CHECK: "address-qualifier": "SCALAR", -// CHECK: "name": "ninstr", -// CHECK: "offset": "0x10", -// CHECK: "type": "uint32_t" -// CHECK: }, -// CHECK: { -// CHECK: "address-qualifier": "GLOBAL", -// CHECK: "memory-connection": "HOST", -// CHECK: "name": "bo0", -// CHECK: "offset": "0x14", -// CHECK: "type": "void*" -// CHECK: }, -// CHECK: { -// CHECK: "address-qualifier": "GLOBAL", -// CHECK: "memory-connection": "HOST", -// CHECK: "name": "bo1", -// CHECK: "offset": "0x1c", -// CHECK: "type": "void*" -// CHECK: }, -// CHECK: { -// CHECK: "address-qualifier": "GLOBAL", -// CHECK: "memory-connection": "HOST", -// CHECK: "name": "bo2", -// CHECK: "offset": "0x24", -// CHECK: "type": "void*" -// CHECK: }, -// CHECK: { -// CHECK: "address-qualifier": "GLOBAL", -// CHECK: "memory-connection": "HOST", -// CHECK: "name": "bo3", -// CHECK: "offset": "0x2c", -// CHECK: "type": "void*" -// CHECK: }, -// CHECK: { -// CHECK: "address-qualifier": "GLOBAL", -// CHECK: "memory-connection": "HOST", -// CHECK: "name": "bo4", -// CHECK: "offset": "0x34", -// CHECK: "type": "void*" -// CHECK: }, -// CHECK: { -// CHECK: "address-qualifier": "GLOBAL", -// CHECK: "memory-connection": "HOST", -// CHECK: "name": "bo5", -// CHECK: "offset": "0x3c", -// CHECK: "type": "void*" -// CHECK: } -// CHECK: ], -// CHECK: "extended-data": { -// CHECK: "dpu_kernel_id": "0x901", -// CHECK: "functional": "0", -// CHECK: "subtype": "DPU" -// CHECK: }, -// CHECK: "instances": [ -// CHECK: { -// CHECK: "name": "MLIRAIEV1" -// CHECK: } -// CHECK: ], -// CHECK: "name": "MLIR_AIE", -// CHECK: "type": "dpu" -// CHECK: } -// CHECK: ] -// CHECK: } -// CHECK: } - -module { - aie.device(npu1_4col) { - memref.global "public" @in0 : memref<1024xi32> - memref.global "public" @out0 : memref<1024xi32> - memref.global "public" @in1 : memref<1024xi32> - memref.global "public" @out1 : memref<1024xi32> - memref.global "public" @in2 : memref<1024xi32> - memref.global "public" @out2 : memref<1024xi32> - %02 = aie.tile(0, 2) - %12 = aie.tile(1, 2) - %22 = aie.tile(2, 2) - - aie.core(%12) { - aie.end - } - aie.shim_dma_allocation @in0(MM2S, 0, 0) - aie.shim_dma_allocation @out0(S2MM, 0, 0) - aie.shim_dma_allocation @in1(MM2S, 1, 0) - aie.shim_dma_allocation @out1(S2MM, 1, 0) - aie.shim_dma_allocation @in2(MM2S, 2, 0) - aie.shim_dma_allocation @out2(S2MM, 2, 0) - - func.func @sequence(%arg0: memref<1024xi32>, %arg1: memref<1024xi32>, %arg2: memref<1024xi32>, %arg3: memref<1024xi32>, %arg4: memref<1024xi32>, %arg5: memref<1024xi32>) { - aiex.npu.dma_memcpy_nd (0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1]) {id = 0 : i64, metadata = @in0} : memref<1024xi32> - aiex.npu.dma_memcpy_nd (0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1]) {id = 1 : i64, metadata = @out0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd (0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1]) {id = 2 : i64, metadata = @in1} : memref<1024xi32> - aiex.npu.dma_memcpy_nd (0, 0, %arg3[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1]) {id = 3 : i64, metadata = @out1} : memref<1024xi32> - aiex.npu.sync {channel = 1 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd (0, 0, %arg4[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1]) {id = 2 : i64, metadata = @in2} : memref<1024xi32> - aiex.npu.dma_memcpy_nd (0, 0, %arg5[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1]) {id = 3 : i64, metadata = @out2} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 2 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - return - } - } -} \ No newline at end of file diff --git a/test/aie2xclbin/simple_xclbin.mlir b/test/aie2xclbin/simple_xclbin.mlir deleted file mode 100644 index bd4f32a10c..0000000000 --- a/test/aie2xclbin/simple_xclbin.mlir +++ /dev/null @@ -1,32 +0,0 @@ -//===- simple.mlir ---------------------------------------------*- MLIR -*-===// -// -// This file is licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// (c) Copyright 2023 Xilinx Inc. -// -//===----------------------------------------------------------------------===// - -// RUN: aie2xclbin -v --host-target=aarch64-linux-gnu --peano=%PEANO_INSTALL_DIR %s --xclbin-name=test.xclbin | FileCheck %s --check-prefix=PEANO -// REQUIRES: peano - -// Note that llc determines the architecture from the llvm IR. -// PEANO-NOT: xchesscc_wrapper -// PEANO: llc -// PEANO: bootgen -// PEANO: xclbinutil -// PEANO-NOT: xchesscc_wrapper - -module { - aie.device(npu1_4col) { - %12 = aie.tile(1, 2) - %buf = aie.buffer(%12) : memref<256xi32> - %4 = aie.core(%12) { - %0 = arith.constant 0 : i32 - %1 = arith.constant 0 : index - memref.store %0, %buf[%1] : memref<256xi32> - aie.end - } - } -} diff --git a/test/aie2xclbin/simple_xclbin_chess.mlir b/test/aie2xclbin/simple_xclbin_chess.mlir deleted file mode 100644 index 5f27331b27..0000000000 --- a/test/aie2xclbin/simple_xclbin_chess.mlir +++ /dev/null @@ -1,32 +0,0 @@ -//===- simple.mlir ---------------------------------------------*- MLIR -*-===// -// -// This file is licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// (c) Copyright 2023 Xilinx Inc. -// -//===----------------------------------------------------------------------===// - -// RUN: aie2xclbin -v --use-chess --host-target=aarch64-linux-gnu %s --xclbin-name=test.xclbin | FileCheck %s -// REQUIRES: valid_xchess_license, xrt - -// Note that llc determines the architecture from the llvm IR. -// CHECK-NOT: llc -// CHECK: xchesscc_wrapper -// CHECK: bootgen -// CHECK: xclbinutil -// CHECK-NOT: llc - -module { - aie.device(npu1_4col) { - %12 = aie.tile(1, 2) - %buf = aie.buffer(%12) : memref<256xi32> - %4 = aie.core(%12) { - %0 = arith.constant 0 : i32 - %1 = arith.constant 0 : index - memref.store %0, %buf[%1] : memref<256xi32> - aie.end - } - } -} diff --git a/test/lit.cfg.py b/test/lit.cfg.py index f2a5deb560..5174b4868e 100644 --- a/test/lit.cfg.py +++ b/test/lit.cfg.py @@ -279,7 +279,6 @@ def prepend_path(path): tools = [ "aie-opt", "aie-translate", - "aie2xclbin", "aiecc.py", "ld.lld", "llc", diff --git a/test/npu-xrt/matrix_multiplication_using_dma/aie.mlir b/test/npu-xrt/matrix_multiplication_using_dma/aie.mlir deleted file mode 100644 index ba0cf3fa47..0000000000 --- a/test/npu-xrt/matrix_multiplication_using_dma/aie.mlir +++ /dev/null @@ -1,254 +0,0 @@ -//===- aie.mlir ------------------------------------------------*- MLIR -*-===// -// -// This file is licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// (c) Copyright 2022-2024 Advanced Micro Devices, Inc. or its affiliates -// Copyright (C) 2020-2022, Xilinx Inc. -// -//===----------------------------------------------------------------------===// - -module { - aie.device(npu1_1col) { - memref.global "public" @inA : memref<64x32xi16> - memref.global "public" @inA_cons : memref<64x32xi16> - memref.global "public" @inB : memref<32x64xi16> - memref.global "public" @inB_cons : memref<32x64xi16> - memref.global "public" @aie.memA : memref<64x32xi16> - memref.global "public" @aie.memA_cons : memref<64x32xi16> - memref.global "public" @aie.memB : memref<32x64xi16> - memref.global "public" @aie.memB_cons : memref<32x64xi16> - memref.global "public" @aie.memC : memref<64x64xi16> - memref.global "public" @aie.memC_cons : memref<64x64xi16> - memref.global "public" @outC : memref<64x64xi16> - memref.global "public" @outC_cons : memref<64x64xi16> - - func.func private @matmul_i16_i16(memref<64x32xi16>, memref<32x64xi16>, memref<64x64xi16>) - func.func private @matmul_scalar_i16_i16(memref<64x32xi16>, memref<32x64xi16>, memref<64x64xi16>) - func.func private @zero_i16(memref<64x64xi16>) - func.func private @zero_scalar_i16(memref<64x64xi16>) - - %tile_0_0 = aie.tile(0, 0) - %tile_0_1 = aie.tile(0, 1) - %tile_0_2 = aie.tile(0, 2) - - %memA_cons_buff_0 = aie.buffer(%tile_0_2) {sym_name = "memA_cons_buff_0"} : memref<64x32xi16> - %memA_cons_buff_1 = aie.buffer(%tile_0_2) {sym_name = "memA_cons_buff_1"} : memref<64x32xi16> - %memB_cons_buff_0 = aie.buffer(%tile_0_2) {sym_name = "memB_cons_buff_0"} : memref<32x64xi16> - %memB_cons_buff_1 = aie.buffer(%tile_0_2) {sym_name = "memB_cons_buff_1"} : memref<32x64xi16> - %memC_buff_0 = aie.buffer(%tile_0_2) {sym_name = "memC_buff_0"} : memref<64x64xi16> - %memC_buff_1 = aie.buffer(%tile_0_2) {sym_name = "memC_buff_1"} : memref<64x64xi16> - - %memA_cons_cons_lock = aie.lock(%tile_0_2, 1) {init = 0 : i32, sym_name = "memA_cons_cons_lock"} - %memA_cons_prod_lock = aie.lock(%tile_0_2, 0) {init = 2 : i32, sym_name = "memA_cons_prod_lock"} - %memB_cons_cons_lock = aie.lock(%tile_0_2, 3) {init = 0 : i32, sym_name = "memB_cons_cons_lock"} - %memB_cons_prod_lock = aie.lock(%tile_0_2, 2) {init = 2 : i32, sym_name = "memB_cons_prod_lock"} - %memC_cons_cons_lock = aie.lock(%tile_0_1, 5) {init = 0 : i32, sym_name = "memC_cons_cons_lock"} - %memC_cons_lock = aie.lock(%tile_0_2, 5) {init = 0 : i32, sym_name = "memC_cons_lock"} - %memC_cons_prod_lock = aie.lock(%tile_0_1, 4) {init = 2 : i32, sym_name = "memC_cons_prod_lock"} - %memC_prod_lock = aie.lock(%tile_0_2, 4) {init = 2 : i32, sym_name = "memC_prod_lock"} - - aie.flow(%tile_0_0, DMA : 0, %tile_0_1, DMA : 0) - aie.flow(%tile_0_1, DMA : 0, %tile_0_2, DMA : 0) - aie.flow(%tile_0_0, DMA : 1, %tile_0_1, DMA : 1) - aie.flow(%tile_0_1, DMA : 1, %tile_0_2, DMA : 1) - aie.flow(%tile_0_2, DMA : 0, %tile_0_1, DMA : 2) - aie.flow(%tile_0_1, DMA : 2, %tile_0_0, DMA : 0) - - %core_0_2 = aie.core(%tile_0_2) { - %c4 = arith.constant 4 : index - %c0 = arith.constant 0 : index - %c4294967295 = arith.constant 4294967295 : index - %c1 = arith.constant 1 : index - scf.for %arg0 = %c0 to %c4294967295 step %c1 { - %c2 = arith.constant 2 : index - scf.for %arg1 = %c0 to %c4 step %c2 { - aie.use_lock(%memC_prod_lock, AcquireGreaterEqual) - func.call @zero_i16(%memC_buff_0) : (memref<64x64xi16>) -> () - %c2_0 = arith.constant 2 : index - scf.for %arg2 = %c0 to %c4 step %c2_0 { - aie.use_lock(%memA_cons_cons_lock, AcquireGreaterEqual) - aie.use_lock(%memB_cons_cons_lock, AcquireGreaterEqual) - func.call @matmul_i16_i16(%memA_cons_buff_0, %memB_cons_buff_0, %memC_buff_0) : (memref<64x32xi16>, memref<32x64xi16>, memref<64x64xi16>) -> () - aie.use_lock(%memA_cons_prod_lock, Release) - aie.use_lock(%memB_cons_prod_lock, Release) - - aie.use_lock(%memA_cons_cons_lock, AcquireGreaterEqual) - aie.use_lock(%memB_cons_cons_lock, AcquireGreaterEqual) - func.call @matmul_i16_i16(%memA_cons_buff_1, %memB_cons_buff_1, %memC_buff_0) : (memref<64x32xi16>, memref<32x64xi16>, memref<64x64xi16>) -> () - aie.use_lock(%memA_cons_prod_lock, Release) - aie.use_lock(%memB_cons_prod_lock, Release) - } - aie.use_lock(%memC_cons_lock, Release) - aie.use_lock(%memC_prod_lock, AcquireGreaterEqual) - func.call @zero_i16(%memC_buff_1) : (memref<64x64xi16>) -> () - %c2_1 = arith.constant 2 : index - scf.for %arg2 = %c0 to %c4 step %c2_1 { - aie.use_lock(%memA_cons_cons_lock, AcquireGreaterEqual) - aie.use_lock(%memB_cons_cons_lock, AcquireGreaterEqual) - func.call @matmul_i16_i16(%memA_cons_buff_0, %memB_cons_buff_0, %memC_buff_1) : (memref<64x32xi16>, memref<32x64xi16>, memref<64x64xi16>) -> () - aie.use_lock(%memA_cons_prod_lock, Release) - aie.use_lock(%memB_cons_prod_lock, Release) - - aie.use_lock(%memA_cons_cons_lock, AcquireGreaterEqual) - aie.use_lock(%memB_cons_cons_lock, AcquireGreaterEqual) - func.call @matmul_i16_i16(%memA_cons_buff_1, %memB_cons_buff_1, %memC_buff_1) : (memref<64x32xi16>, memref<32x64xi16>, memref<64x64xi16>) -> () - aie.use_lock(%memA_cons_prod_lock, Release) - aie.use_lock(%memB_cons_prod_lock, Release) - } - aie.use_lock(%memC_cons_lock, Release) - } - } - aie.end - } {link_with = "mm.o"} - - aie.shim_dma_allocation @inA(MM2S, 0, 0) - - func.func @sequence(%arg0: memref<8192xi32>, %arg1: memref<8192xi32>, %arg2: memref<8192xi32>) { - %c2048_i64 = arith.constant 2048 : i64 - %c16_i64 = arith.constant 16 : i64 - %c4_i64 = arith.constant 4 : i64 - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 0 : i64 - %c2_i64 = arith.constant 2 : i64 - %c64_i64 = arith.constant 64 : i64 - %c32_i64 = arith.constant 32 : i64 - %c4096_i64 = arith.constant 4096 : i64 - aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c2_i64, %c2_i64, %c64_i64, %c32_i64] [%c4096_i64, %c32_i64, %c64_i64, %c1_i64]) {id = 0 : i64, metadata = @outC} : memref<8192xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c2_i64, %c4_i64, %c64_i64, %c16_i64] [%c0_i64, %c16_i64, %c64_i64, %c1_i64]) {id = 1 : i64, metadata = @inA} : memref<8192xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg1[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c2_i64, %c4_i64, %c32_i64, %c32_i64] [%c32_i64, %c2048_i64, %c64_i64, %c1_i64]) {id = 2 : i64, metadata = @inB} : memref<8192xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c4096_i64] [%c2_i64, %c4_i64, %c64_i64, %c16_i64] [%c0_i64, %c16_i64, %c64_i64, %c1_i64]) {id = 3 : i64, metadata = @inA} : memref<8192xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg1[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c2_i64, %c4_i64, %c32_i64, %c32_i64] [%c32_i64, %c2048_i64, %c64_i64, %c1_i64]) {id = 4 : i64, metadata = @inB} : memref<8192xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - return - } - - %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) { - %inA_cons_buff_0 = aie.buffer(%tile_0_1) {sym_name = "inA_cons_buff_0"} : memref<64x32xi16> - %inA_cons_buff_1 = aie.buffer(%tile_0_1) {sym_name = "inA_cons_buff_1"} : memref<64x32xi16> - %inB_cons_buff_0 = aie.buffer(%tile_0_1) {sym_name = "inB_cons_buff_0"} : memref<32x64xi16> - %inB_cons_buff_1 = aie.buffer(%tile_0_1) {sym_name = "inB_cons_buff_1"} : memref<32x64xi16> - %memC_cons_buff_0 = aie.buffer(%tile_0_1) {sym_name = "memC_cons_buff_0"} : memref<64x64xi16> - %memC_cons_buff_1 = aie.buffer(%tile_0_1) {sym_name = "memC_cons_buff_1"} : memref<64x64xi16> - %inA_cons_prod_lock = aie.lock(%tile_0_1, 0) {init = 2 : i32, sym_name = "inA_cons_prod_lock"} - %inA_cons_cons_lock = aie.lock(%tile_0_1, 1) {init = 0 : i32, sym_name = "inA_cons_cons_lock"} - %inB_cons_cons_lock = aie.lock(%tile_0_1, 3) {init = 0 : i32, sym_name = "inB_cons_cons_lock"} - %inB_cons_prod_lock = aie.lock(%tile_0_1, 2) {init = 2 : i32, sym_name = "inB_cons_prod_lock"} - %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3) - ^bb1: // 2 preds: ^bb0, ^bb2 - aie.use_lock(%inA_cons_prod_lock, AcquireGreaterEqual) - aie.dma_bd(%inA_cons_buff_0 : memref<64x32xi16>, 0, 2048) - aie.use_lock(%inA_cons_cons_lock, Release) - aie.next_bd ^bb2 - ^bb2: // pred: ^bb1 - aie.use_lock(%inA_cons_prod_lock, AcquireGreaterEqual) - aie.dma_bd(%inA_cons_buff_1 : memref<64x32xi16>, 0, 2048) - aie.use_lock(%inA_cons_cons_lock, Release) - aie.next_bd ^bb1 - ^bb3: // pred: ^bb0 - %1 = aie.dma_start(MM2S, 0, ^bb4, ^bb6) - ^bb4: // 2 preds: ^bb3, ^bb5 - aie.use_lock(%inA_cons_cons_lock, AcquireGreaterEqual) - aie.dma_bd(%inA_cons_buff_0 : memref<64x32xi16>, 0, 2048, [, , , ]) - aie.use_lock(%inA_cons_prod_lock, Release) - aie.next_bd ^bb5 - ^bb5: // pred: ^bb4 - aie.use_lock(%inA_cons_cons_lock, AcquireGreaterEqual) - aie.dma_bd(%inA_cons_buff_1 : memref<64x32xi16>, 0, 2048, [, , , ]) - aie.use_lock(%inA_cons_prod_lock, Release) - aie.next_bd ^bb4 - ^bb6: // pred: ^bb3 - %2 = aie.dma_start(S2MM, 1, ^bb7, ^bb9) - ^bb7: // 2 preds: ^bb6, ^bb8 - aie.use_lock(%inB_cons_prod_lock, AcquireGreaterEqual) - aie.dma_bd(%inB_cons_buff_0 : memref<32x64xi16>, 0, 2048) - aie.use_lock(%inB_cons_cons_lock, Release) - aie.next_bd ^bb8 - ^bb8: // pred: ^bb7 - aie.use_lock(%inB_cons_prod_lock, AcquireGreaterEqual) - aie.dma_bd(%inB_cons_buff_1 : memref<32x64xi16>, 0, 2048) - aie.use_lock(%inB_cons_cons_lock, Release) - aie.next_bd ^bb7 - ^bb9: // pred: ^bb6 - %3 = aie.dma_start(MM2S, 1, ^bb10, ^bb12) - ^bb10: // 2 preds: ^bb9, ^bb11 - aie.use_lock(%inB_cons_cons_lock, AcquireGreaterEqual) - aie.dma_bd(%inB_cons_buff_0 : memref<32x64xi16>, 0, 2048, [, , , ]) - aie.use_lock(%inB_cons_prod_lock, Release) - aie.next_bd ^bb11 - ^bb11: // pred: ^bb10 - aie.use_lock(%inB_cons_cons_lock, AcquireGreaterEqual) - aie.dma_bd(%inB_cons_buff_1 : memref<32x64xi16>, 0, 2048, [, , , ]) - aie.use_lock(%inB_cons_prod_lock, Release) - aie.next_bd ^bb10 - ^bb12: // pred: ^bb9 - %4 = aie.dma_start(S2MM, 2, ^bb13, ^bb15) - ^bb13: // 2 preds: ^bb12, ^bb14 - aie.use_lock(%memC_cons_prod_lock, AcquireGreaterEqual) - aie.dma_bd(%memC_cons_buff_0 : memref<64x64xi16>, 0, 4096) - aie.use_lock(%memC_cons_cons_lock, Release) - aie.next_bd ^bb14 - ^bb14: // pred: ^bb13 - aie.use_lock(%memC_cons_prod_lock, AcquireGreaterEqual) - aie.dma_bd(%memC_cons_buff_1 : memref<64x64xi16>, 0, 4096) - aie.use_lock(%memC_cons_cons_lock, Release) - aie.next_bd ^bb13 - ^bb15: // pred: ^bb12 - %5 = aie.dma_start(MM2S, 2, ^bb16, ^bb18) - ^bb16: // 2 preds: ^bb15, ^bb17 - aie.use_lock(%memC_cons_cons_lock, AcquireGreaterEqual) - aie.dma_bd(%memC_cons_buff_0 : memref<64x64xi16>, 0, 4096, [, , , ]) - aie.use_lock(%memC_cons_prod_lock, Release) - aie.next_bd ^bb17 - ^bb17: // pred: ^bb16 - aie.use_lock(%memC_cons_cons_lock, AcquireGreaterEqual) - aie.dma_bd(%memC_cons_buff_1 : memref<64x64xi16>, 0, 4096, [, , , ]) - aie.use_lock(%memC_cons_prod_lock, Release) - aie.next_bd ^bb16 - ^bb18: // pred: ^bb15 - aie.end - } - aie.shim_dma_allocation @inB(MM2S, 1, 0) - aie.shim_dma_allocation @outC(S2MM, 0, 0) - %mem_0_2 = aie.mem(%tile_0_2) { - %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3) - ^bb1: // 2 preds: ^bb0, ^bb2 - aie.use_lock(%memA_cons_prod_lock, AcquireGreaterEqual) - aie.dma_bd(%memA_cons_buff_0 : memref<64x32xi16>, 0, 2048) - aie.use_lock(%memA_cons_cons_lock, Release) - aie.next_bd ^bb2 - ^bb2: // pred: ^bb1 - aie.use_lock(%memA_cons_prod_lock, AcquireGreaterEqual) - aie.dma_bd(%memA_cons_buff_1 : memref<64x32xi16>, 0, 2048) - aie.use_lock(%memA_cons_cons_lock, Release) - aie.next_bd ^bb1 - ^bb3: // pred: ^bb0 - %1 = aie.dma_start(S2MM, 1, ^bb4, ^bb6) - ^bb4: // 2 preds: ^bb3, ^bb5 - aie.use_lock(%memB_cons_prod_lock, AcquireGreaterEqual) - aie.dma_bd(%memB_cons_buff_0 : memref<32x64xi16>, 0, 2048) - aie.use_lock(%memB_cons_cons_lock, Release) - aie.next_bd ^bb5 - ^bb5: // pred: ^bb4 - aie.use_lock(%memB_cons_prod_lock, AcquireGreaterEqual) - aie.dma_bd(%memB_cons_buff_1 : memref<32x64xi16>, 0, 2048) - aie.use_lock(%memB_cons_cons_lock, Release) - aie.next_bd ^bb4 - ^bb6: // pred: ^bb3 - %2 = aie.dma_start(MM2S, 0, ^bb7, ^bb9) - ^bb7: // 2 preds: ^bb6, ^bb8 - aie.use_lock(%memC_cons_lock, AcquireGreaterEqual) - aie.dma_bd(%memC_buff_0 : memref<64x64xi16>, 0, 4096) - aie.use_lock(%memC_prod_lock, Release) - aie.next_bd ^bb8 - ^bb8: // pred: ^bb7 - aie.use_lock(%memC_cons_lock, AcquireGreaterEqual) - aie.dma_bd(%memC_buff_1 : memref<64x64xi16>, 0, 4096) - aie.use_lock(%memC_prod_lock, Release) - aie.next_bd ^bb7 - ^bb9: // pred: ^bb6 - aie.end - } - } -} diff --git a/test/npu-xrt/matrix_multiplication_using_dma/mm.cc b/test/npu-xrt/matrix_multiplication_using_dma/mm.cc deleted file mode 100755 index 8f814cf809..0000000000 --- a/test/npu-xrt/matrix_multiplication_using_dma/mm.cc +++ /dev/null @@ -1,216 +0,0 @@ -//===- mm.cc ----------------------------------------------000---*- C++ -*-===// -// -// This file is licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// Copyright (C) 2023, Advanced Micro Devices, Inc. -// -//===----------------------------------------------------------------------===// - -#define __AIENGINE__ 2 -#define NOCPP -#define __AIEARCH__ 20 - -#include -#include -#include -#include - -#define REL_WRITE 0 -#define REL_READ 1 - -#include - -#include "zero.cc" - -template -void matmul_scalar(T_in *a, T_in *b, T_out *c) { - event0(); - for (int row = 0; row < M; row++) { - for (int col = 0; col < N; col++) { - T_out running_sum = 0; - for (int i = 0; i < K; i++) { - running_sum += a[row * K + i] * b[i * N + col]; - } - c[row * N + col] += running_sum; - } - } - event1(); -} - -template -void matmul_vectorized(const T_in *__restrict pA, const T_in *__restrict pB, - T_out *__restrict pC) { - using MMUL = aie::mmul; - - event0(); - - // For int16 (4x4x4), this implementation iterates over the output space in - // steps of 4x4 tiles; each iteration makes an r*s, s*t and r*t step in the - // input and output space, respectively. The data layout expected is such - // that each r*s/s*t/r*t tile's elements are laid out contiguously in - // row-major order, and tiles themselves are organized in row-major - // order. For example, for 4x4x4 tiles, this means that an element in - // row 1, column 0 would be stored at offset 4 (since the first 4x4 tile - // is laid out contiguously in row-major). An element in row 0, column 4 - // would be stored at offset 16 in the same example. - - for (unsigned z = 0; z < rowA; z += 2) - chess_loop_range(2, ) { - T_out *__restrict pC1 = pC + (z * colB + 0) * MMUL::size_C; - T_out *__restrict pC2 = pC + ((z + 1) * colB + 0) * MMUL::size_C; - - for (unsigned j = 0; j < colB; j += 2) - chess_loop_range(2, ) { - const T_in *__restrict pA1 = pA + (z * colA + 0) * MMUL::size_A; - const T_in *__restrict pA2 = pA + ((z + 1) * colA + 0) * MMUL::size_A; - const T_in *__restrict pB1 = pB + (0 * colB + j) * MMUL::size_B; - const T_in *__restrict pB2 = pB + (0 * colB + (j + 1)) * MMUL::size_B; - - aie::vector A0 = aie::load_v(pA1); - pA1 += MMUL::size_A; - aie::vector A1 = aie::load_v(pA2); - pA2 += MMUL::size_A; - aie::vector B0 = aie::load_v(pB1); - pB1 += MMUL::size_B * colB; - aie::vector B1 = aie::load_v(pB2); - pB2 += MMUL::size_B * colB; - - // We modify the library documentation implementation to accumulate - // in the C dimension, since this vectorized kernel will be called - // multiple times as we further tile the input at a higher level. - aie::vector acc_C00 = - aie::load_v(pC1); - aie::vector acc_C01 = - aie::load_v(pC1 + MMUL::size_C); - aie::vector acc_C10 = - aie::load_v(pC2); - aie::vector acc_C11 = - aie::load_v(pC2 + MMUL::size_C); - - MMUL C00(acc_C00); - MMUL C01(acc_C01); - MMUL C10(acc_C10); - MMUL C11(acc_C11); - - C00.mac(A0, B0); - C01.mac(A0, B1); - C10.mac(A1, B0); - C11.mac(A1, B1); - - for (unsigned i = 1; i < colA; ++i) - chess_prepare_for_pipelining chess_loop_range(3, ) { - A0 = aie::load_v(pA1); - pA1 += MMUL::size_A; - A1 = aie::load_v(pA2); - pA2 += MMUL::size_A; - B0 = aie::load_v(pB1); - pB1 += MMUL::size_B * colB; - B1 = aie::load_v(pB2); - pB2 += MMUL::size_B * colB; - - C00.mac(A0, B0); - C01.mac(A0, B1); - C10.mac(A1, B0); - C11.mac(A1, B1); - } - - aie::store_v(pC1, C00.template to_vector()); - pC1 += MMUL::size_C; - aie::store_v(pC1, C01.template to_vector()); - pC1 += MMUL::size_C; - aie::store_v(pC2, C10.template to_vector()); - pC2 += MMUL::size_C; - aie::store_v(pC2, C11.template to_vector()); - pC2 += MMUL::size_C; - } - } - - event1(); -} - -template -void matmul_vectorized_4x4x4_i16_i16(const int16 *__restrict pA, - const int16 *__restrict pB, - int16 *__restrict pC) { - // matmul_vectorized operates on two 4x4 input blocks of A, and two 4x4 input - // blocks of B in each iteration. Make sure we have at least 2 blocks in each - // dimension, and that our input matrix is evenly divisible. - constexpr int r = 4; - constexpr int s = 4; - constexpr int t = 4; - static_assert(m % (2 * r) == 0 && m / (2 * r) > 0); - static_assert(k % (2 * s) == 0 && k / (2 * s) > 0); - static_assert(n % (2 * t) == 0 && n / (2 * t) > 0); - return matmul_vectorized(pA, pB, - pC); -} - -template -void matmul_vectorized_4x8x4_bf16_bf16(const bfloat16 *__restrict pA, - const bfloat16 *__restrict pB, - bfloat16 *__restrict pC) { - constexpr int r = 4; - constexpr int s = 8; - constexpr int t = 4; - static_assert(m % (2 * r) == 0 && m / (2 * r) > 0); - static_assert(k % (2 * s) == 0 && k / (2 * s) > 0); - static_assert(n % (2 * t) == 0 && n / (2 * t) > 0); - return matmul_vectorized( - pA, pB, pC); -} - -template -void matmul_vectorized_4x8x4_bf16_f32(const bfloat16 *__restrict pA, - const bfloat16 *__restrict pB, - float *__restrict pC) { - constexpr int r = 4; - constexpr int s = 8; - constexpr int t = 4; - static_assert(m % (2 * r) == 0 && m / (2 * r) > 0); - static_assert(k % (2 * s) == 0 && k / (2 * s) > 0); - static_assert(n % (2 * t) == 0 && n / (2 * t) > 0); - return matmul_vectorized( - pA, pB, pC); -} - -extern "C" { - -#define combos(X) \ - X(int16, i16, int16, i16, 4, 4, 4) \ - X(bfloat16, bf16, bfloat16, bf16, 4, 8, 4) \ - X(bfloat16, bf16, float, f32, 4, 8, 4) - -#define matmul_vectorized_c_func(ctype_in, mlir_type_in, ctype_out, \ - mlir_type_out, r, s, t) \ - void matmul_##mlir_type_in##_##mlir_type_out(ctype_in *a_in, ctype_in *b_in, \ - ctype_out *c_out) { \ - matmul_vectorized_##r##x##s##x##t##_##mlir_type_in##_##mlir_type_out< \ - 64, 32, 64>(a_in, b_in, c_out); \ - } - -#define matmul_scalar_c_func(ctype_in, mlir_type_in, ctype_out, mlir_type_out, \ - r, s, t) \ - void matmul_scalar_##mlir_type_in##_##mlir_type_out( \ - ctype_in *a_in, ctype_in *b_in, ctype_out *c_out) { \ - matmul_scalar(a_in, b_in, c_out); \ - } - -#define zero_vectorized_c_func(ctype_in, mlir_type_in, ctype_out, \ - mlir_type_out, r, s, t) \ - void zero_##mlir_type_out(ctype_out *c_out) { \ - zero_vectorized(c_out); \ - } - -#define zero_scalar_c_func(ctype_in, mlir_type_in, ctype_out, mlir_type_out, \ - r, s, t) \ - void zero_scalar_##mlir_type_out(ctype_out *c_out) { \ - zero_scalar(c_out); \ - } - -combos(matmul_vectorized_c_func) combos(matmul_scalar_c_func) - combos(zero_vectorized_c_func) combos(zero_scalar_c_func) - -} // extern "C" diff --git a/test/npu-xrt/matrix_multiplication_using_dma/run-a2x.lit b/test/npu-xrt/matrix_multiplication_using_dma/run-a2x.lit deleted file mode 100644 index 33880c807b..0000000000 --- a/test/npu-xrt/matrix_multiplication_using_dma/run-a2x.lit +++ /dev/null @@ -1,10 +0,0 @@ -// (c) Copyright 2023 Advanced Micro Devices, Inc. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// REQUIRES: ryzen_ai, chess -// -// RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/mm.cc -o ./mm.o -// RUN: aie2xclbin --use-chess --xclbin-name=aie2.xclbin --npu-insts-name=insts2.txt --tmpdir=aie2xclbin.prj -v %S/aie.mlir -// RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem -// RUN: %run_on_npu ./test.exe -x aie2.xclbin -k MLIR_AIE -i insts2.txt | FileCheck %s -// CHECK: PASS! diff --git a/test/npu-xrt/matrix_multiplication_using_dma/test.cpp b/test/npu-xrt/matrix_multiplication_using_dma/test.cpp deleted file mode 100644 index 1316c5d4d1..0000000000 --- a/test/npu-xrt/matrix_multiplication_using_dma/test.cpp +++ /dev/null @@ -1,241 +0,0 @@ -//===- test.cpp -------------------------------------------000---*- C++ -*-===// -// -// This file is licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// Copyright (C) 2023, Advanced Micro Devices, Inc. -// -//===----------------------------------------------------------------------===// - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "xrt/xrt_bo.h" -#include "xrt/xrt_device.h" -#include "xrt/xrt_kernel.h" - -constexpr int M = 128; -constexpr int K = 128; -constexpr int N = 128; - -constexpr int A_VOLUME = M * K; -constexpr int B_VOLUME = N * K; -constexpr int C_VOLUME = M * N; - -using A_DATATYPE = std::int16_t; -using B_DATATYPE = std::int16_t; -using C_DATATYPE = std::int16_t; - -constexpr int A_SIZE = (A_VOLUME * sizeof(A_DATATYPE)); -constexpr int B_SIZE = (B_VOLUME * sizeof(B_DATATYPE)); -constexpr int C_SIZE = (C_VOLUME * sizeof(C_DATATYPE)); - -namespace po = boost::program_options; - -void check_arg_file_exists(po::variables_map &vm_in, std::string name) { - if (!vm_in.count(name)) { - throw std::runtime_error("Error: no " + name + " file was provided\n"); - } else { - std::ifstream test(vm_in[name].as()); - if (!test) { - throw std::runtime_error("The " + name + " file " + - vm_in[name].as() + - " does not exist.\n"); - } - } -} - -std::vector load_instr_sequence(std::string instr_path) { - std::ifstream instr_file(instr_path); - std::string line; - std::vector instr_v; - while (std::getline(instr_file, line)) { - std::istringstream iss(line); - uint32_t a; - if (!(iss >> std::hex >> a)) { - throw std::runtime_error("Unable to parse instruction file\n"); - } - instr_v.push_back(a); - } - return instr_v; -} - -static inline std::int16_t random_int16_t() { - return ((std::int16_t)rand() % 0x10000); -} - -template -void matmul(std::vector a, std::vector b, std::vector &c) { - for (int row = 0; row < M; row++) { - for (int col = 0; col < N; col++) { - Tout running_sum = 0; - for (int i = 0; i < K; i++) { - running_sum += a[row * K + i] * b[i * N + col]; - } - c[row * N + col] += running_sum; - } - } -} - -int main(int argc, const char *argv[]) { - - // Program arguments parsing - po::options_description desc("Allowed options"); - desc.add_options()("help,h", "produce help message")( - "xclbin,x", po::value()->required(), - "the input xclbin path")( - "kernel,k", po::value()->required(), - "the kernel name in the XCLBIN (for instance PP_PRE_FD)")( - "verbosity,v", po::value()->default_value(0), - "the verbosity of the output")( - "instr,i", po::value()->required(), - "path of file containing userspace instructions to be sent to the LX6"); - po::variables_map vm; - - try { - po::store(po::parse_command_line(argc, argv, desc), vm); - po::notify(vm); - - if (vm.count("help")) { - std::cout << desc << "\n"; - return 1; - } - } catch (const std::exception &ex) { - std::cerr << ex.what() << "\n\n"; - std::cerr << "Usage:\n" << desc << "\n"; - return 1; - } - - check_arg_file_exists(vm, "xclbin"); - check_arg_file_exists(vm, "instr"); - - std::vector instr_v = - load_instr_sequence(vm["instr"].as()); - - int verbosity = vm["verbosity"].as(); - if (verbosity >= 1) - std::cout << "Sequence instr count: " << instr_v.size() << "\n"; - - // Start the XRT test code - // Get a device handle - unsigned int device_index = 0; - auto device = xrt::device(device_index); - - // Load the xclbin - if (verbosity >= 1) - std::cout << "Loading xclbin: " << vm["xclbin"].as() << "\n"; - auto xclbin = xrt::xclbin(vm["xclbin"].as()); - - if (verbosity >= 1) - std::cout << "Kernel opcode: " << vm["kernel"].as() << "\n"; - std::string Node = vm["kernel"].as(); - - // Get the kernel from the xclbin - auto xkernels = xclbin.get_kernels(); - auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(), - [Node](xrt::xclbin::kernel &k) { - auto name = k.get_name(); - std::cout << "Name: " << name << std::endl; - return name.rfind(Node, 0) == 0; - }); - auto kernelName = xkernel.get_name(); - - if (verbosity >= 1) - std::cout << "Registering xclbin: " << vm["xclbin"].as() - << "\n"; - - device.register_xclbin(xclbin); - - // get a hardware context - if (verbosity >= 1) - std::cout << "Getting hardware context.\n"; - xrt::hw_context context(device, xclbin.get_uuid()); - - // get a kernel handle - if (verbosity >= 1) - std::cout << "Getting handle to kernel:" << kernelName << "\n"; - auto kernel = xrt::kernel(context, kernelName); - - auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); - auto bo_a = - xrt::bo(device, A_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); - auto bo_b = - xrt::bo(device, B_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); - auto bo_c = - xrt::bo(device, C_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); - - if (verbosity >= 1) - std::cout << "Writing data into buffer objects.\n"; - srand(static_cast(time(0))); - A_DATATYPE *bufA = bo_a.map(); - std::vector AVec; - for (int i = 0; i < A_VOLUME; i++) - AVec.push_back(random_int16_t()); - memcpy(bufA, AVec.data(), (AVec.size() * sizeof(A_DATATYPE))); - B_DATATYPE *bufB = bo_b.map(); - std::vector BVec; - for (int i = 0; i < B_VOLUME; i++) - BVec.push_back(random_int16_t()); - memcpy(bufB, BVec.data(), (BVec.size() * sizeof(B_DATATYPE))); - C_DATATYPE *bufC = bo_c.map(); - std::vector CVec; - for (int i = 0; i < C_VOLUME; i++) - CVec.push_back(0); - memcpy(bufC, CVec.data(), (CVec.size() * sizeof(C_DATATYPE))); - - void *bufInstr = bo_instr.map(); - memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); - - bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); - bo_a.sync(XCL_BO_SYNC_BO_TO_DEVICE); - bo_b.sync(XCL_BO_SYNC_BO_TO_DEVICE); - bo_c.sync(XCL_BO_SYNC_BO_TO_DEVICE); - - if (verbosity >= 1) - std::cout << "Running Kernel.\n"; - unsigned int opcode = 3; - auto run = kernel(opcode, bo_instr, instr_v.size(), bo_a, bo_b, bo_c); - run.wait(); - - bo_c.sync(XCL_BO_SYNC_BO_FROM_DEVICE); - - C_DATATYPE *bufOut = bo_c.map(); - - int errors = 0; - int max_errors = 100; - - std::vector output_ref0; - for (uint32_t i = 0; i < C_VOLUME; i++) - output_ref0.push_back(0); - matmul(AVec, BVec, output_ref0); - - for (uint32_t i = 0; i < C_VOLUME; i++) { - if (bufOut[i] != output_ref0[i]) { - errors++; - if (errors < max_errors) { - std::cout << "\nerror, id " << i << " expected " - << std::to_string(output_ref0[i]) << ", got " - << std::to_string(bufOut[i]) << "\n"; - } - } - } - - if (!errors) { - std::cout << "\nPASS!\n\n"; - return 0; - } else { - std::cout << "\nerror count: " << errors << "\n\n"; - std::cout << "\nfailed.\n\n"; - return 1; - } -} \ No newline at end of file diff --git a/test/npu-xrt/matrix_multiplication_using_dma/zero.cc b/test/npu-xrt/matrix_multiplication_using_dma/zero.cc deleted file mode 100644 index 933e8f0bc0..0000000000 --- a/test/npu-xrt/matrix_multiplication_using_dma/zero.cc +++ /dev/null @@ -1,40 +0,0 @@ -//===- zero.cc --------------------------------------------000---*- C++ -*-===// -// -// This file is licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// Copyright (C) 2023, Advanced Micro Devices, Inc. -// -//===----------------------------------------------------------------------===// - -#ifndef ZERO_CC -#define ZERO_CC - -#include -#include -#include -#include - -template -void zero_scalar(T *__restrict c) { - for (int i = 0; i < M * N; i++) { - c[i] = 0; - } -} - -template -void zero_vectorized(T *__restrict c) { - const aie::vector zeros = aie::zeros(); - const T *__restrict c_end = c + M * N; - for (; c + r < c_end; c += r) { - aie::store_v(c, zeros); - } - // Do a scalar write for any remainder not divisible by vector instruction - // size r - for (; c < c_end; c++) { - *c = 0; - } -} - -#endif diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index 65940ee670..83ad092bb8 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -12,6 +12,5 @@ endif() add_subdirectory(aie-lsp-server) add_subdirectory(aie-translate) add_subdirectory(aie-visualize) -add_subdirectory(aie2xclbin) add_subdirectory(bootgen) add_subdirectory(chess-clang) diff --git a/tools/aie2xclbin/CMakeLists.txt b/tools/aie2xclbin/CMakeLists.txt deleted file mode 100644 index cd8afac255..0000000000 --- a/tools/aie2xclbin/CMakeLists.txt +++ /dev/null @@ -1,52 +0,0 @@ -# -# This file is licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -# (c) Copyright 2021 Xilinx Inc. - -set(_aie2xclbin_srcs aie2xclbin.cpp XCLBinGen.cpp) - -add_executable(aie2xclbin ${_aie2xclbin_srcs}) - -target_include_directories(aie2xclbin PUBLIC ${LLVM_INCLUDE_DIRS}) -separate_arguments(LLVM_DEFINITIONS_LIST NATIVE_COMMAND ${LLVM_DEFINITIONS}) -llvm_update_compile_flags(aie2xclbin) - -llvm_map_components_to_libnames(llvm_libs support) -target_link_libraries(aie2xclbin ${llvm_libs}) - -get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) - -configure_file(configure.h.in configure.h) -target_include_directories(aie2xclbin PRIVATE "${CMAKE_CURRENT_BINARY_DIR}") - -if(MSVC) - set(UUID "Rpcrt4.lib") -else() - find_library (UUID uuid REQUIRED) -endif() - -target_link_libraries(aie2xclbin - ${dialect_libs} - MLIRParser - MLIRSCFToControlFlow - MLIRAffineToStandard - MLIRAIEVecDialect - MLIRAIEVecToLLVM - MLIRAIEVecTransforms - MLIRXLLVMToLLVMIRTranslation - ADF - AIE - AIETransforms - AIETargets - AIEX - AIEXTransforms - MLIRAIEVecDialect - MLIRXLLVMDialect - ${UUID}) - -install(TARGETS aie2xclbin - EXPORT AIE2XCLBIN - RUNTIME DESTINATION ${LLVM_TOOLS_INSTALL_DIR} - COMPONENT aie2xclbin) diff --git a/tools/aie2xclbin/XCLBinGen.cpp b/tools/aie2xclbin/XCLBinGen.cpp deleted file mode 100644 index 04f2169c86..0000000000 --- a/tools/aie2xclbin/XCLBinGen.cpp +++ /dev/null @@ -1,957 +0,0 @@ -//===- XCLBinGen.cpp -------------------------------------------*- C++ -*-===// -// -// This file is licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// (c) Copyright 2024 Xilinx Inc. -// -//===---------------------------------------------------------------------===// - -#include "XCLBinGen.h" -#include - -#include "aie/Conversion/AIEVecToLLVM/AIEVecToLLVM.h" -#include "aie/Dialect/AIE/Transforms/AIEPasses.h" -#include "aie/Dialect/AIEVec/Pipelines/Passes.h" -#include "aie/Dialect/AIEX/Transforms/AIEXPasses.h" -#include "aie/InitialAllDialect.h" -#include "aie/Target/LLVMIR/Dialect/XLLVM/XLLVMToLLVMIRTranslation.h" -#include "aie/Targets/AIETargets.h" - -#include "mlir/Conversion/AffineToStandard/AffineToStandard.h" -#include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h" -#include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h" -#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVMPass.h" -#include "mlir/Conversion/MathToLLVM/MathToLLVM.h" -#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h" -#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h" -#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.h" -#include "mlir/Dialect/MemRef/Transforms/Passes.h" -#include "mlir/IR/MLIRContext.h" -#include "mlir/Pass/PassManager.h" -#include "mlir/Support/FileUtilities.h" -#include "mlir/Target/LLVMIR/Export.h" -#include "mlir/Transforms/Passes.h" - -#include "llvm/Support/FileSystem.h" -#include "llvm/Support/JSON.h" -#include "llvm/Support/MemoryBuffer.h" -#include "llvm/Support/Path.h" -#include "llvm/Support/Program.h" -#include "llvm/Support/ToolOutputFile.h" - -#include -#include -#include -#include - -#ifdef _WIN32 -#include "windows.h" -// For UUID stuff -#include "rpcdce.h" - -#define setenv(name, var, ignore) _putenv_s(name, var) -#else -#include -#endif - -using namespace llvm; -using namespace mlir; -using namespace xilinx; - -namespace { - -// Apply the pass manager specific options of the XCLBinGenConfig to the pass -// manager. These control when (if ever) and what IR gets printed between -// passes, and whether the pass manager uses multi-theading. -void applyConfigToPassManager(XCLBinGenConfig &TK, PassManager &pm) { - - pm.getContext()->disableMultithreading(TK.DisableThreading); - - bool printBefore = TK.PrintIRBeforeAll; - auto shouldPrintBeforePass = [printBefore](Pass *, Operation *) { - return printBefore; - }; - - bool printAfter = TK.PrintIRAfterAll; - auto shouldPrintAfterPass = [printAfter](Pass *, Operation *) { - return printAfter; - }; - - pm.enableIRPrinting(shouldPrintBeforePass, shouldPrintAfterPass, - TK.PrintIRModuleScope); - - bool timing = TK.Timing; - if (timing) - pm.enableTiming(); -} -} // namespace - -void xilinx::findVitis(XCLBinGenConfig &TK) { - const char *env_vitis = ::getenv("VITIS"); - if (env_vitis == nullptr) { - if (auto vpp = sys::findProgramByName("v++")) { - SmallString<64> real_vpp; - std::error_code err = sys::fs::real_path(vpp.get(), real_vpp); - if (!err) { - sys::path::remove_filename(real_vpp); - sys::path::remove_filename(real_vpp); - ::setenv("VITIS", real_vpp.c_str(), 1); - dbgs() << "Found Vitis at " << real_vpp.c_str() << "\n"; - } - } - } - env_vitis = ::getenv("VITIS"); - if (env_vitis != nullptr) { - SmallString<64> vitis_path(env_vitis); - SmallString<64> vitis_bin_path(vitis_path); - sys::path::append(vitis_bin_path, "bin"); - - SmallString<64> aietools_path(vitis_path); - sys::path::append(aietools_path, "aietools"); - if (!sys::fs::exists(aietools_path)) { - aietools_path = vitis_path; - sys::path::append(aietools_path, "cardano"); - } - TK.AIEToolsDir = std::string(aietools_path); - ::setenv("AIETOOLS", TK.AIEToolsDir.c_str(), 1); - - SmallString<64> aietools_bin_path(aietools_path); - sys::path::append(aietools_bin_path, "bin"); - const char *env_path = ::getenv("PATH"); - if (env_path == nullptr) - env_path = ""; - SmallString<128> new_path(env_path); - if (new_path.size()) - new_path += sys::EnvPathSeparator; - new_path += aietools_bin_path; - new_path += sys::EnvPathSeparator; - new_path += vitis_bin_path; - ::setenv("PATH", new_path.c_str(), 1); - } else { - errs() << "VITIS not found ...\n"; - } -} - -static std::string getUUIDString() { - std::string val; -#ifdef _WIN32 - UUID *uuid; - RPC_STATUS status; - status = UuidCreate(uuid); - if (status != RPC_S_OK) - errs() << "Failed to create UUID\n"; - RPC_CSTR *uuidstring; - status = UuidToStringA(uuid, uuidstring); - if (status != RPC_S_OK) - errs() << "Failed to convert UUID to string\n"; - val = std::string((char *)uuidstring); - status = RpcStringFreeA(uuidstring); - if (status != RPC_S_OK) - errs() << "Failed to free UUID string\n"; -#else - uuid_t binuuid; - uuid_generate_random(binuuid); - char uuid[37]; - uuid_unparse_lower(binuuid, uuid); - val = std::string(uuid); -#endif - return val; -} -static void addAIELoweringPasses(OpPassManager &pm) { - pm.addPass(createLowerAffinePass()); - pm.addPass(AIE::createAIECanonicalizeDevicePass()); - OpPassManager &devicePM = pm.nest(); - devicePM.addPass(AIE::createAIEAssignLockIDsPass()); - devicePM.addPass(AIE::createAIEObjectFifoRegisterProcessPass()); - devicePM.addPass(AIE::createAIEObjectFifoStatefulTransformPass()); - devicePM.addPass(AIE::createAIEAssignBufferDescriptorIDsPass()); - devicePM.addPass(AIEX::createAIEBroadcastPacketPass()); - devicePM.addPass(AIEX::createAIELowerMulticastPass()); - devicePM.addPass(AIE::createAIEAssignBufferAddressesPass()); - pm.addPass(createConvertSCFToCFPass()); -} - -static void addLowerToLLVMPasses(OpPassManager &pm) { - pm.addPass(createCanonicalizerPass()); - pm.addPass(createCSEPass()); - pm.addPass(xilinx::aievec::createConvertAIEVecToLLVMPass()); - - pm.addPass(createConvertVectorToLLVMPass()); - pm.addPass(memref::createExpandStridedMetadataPass()); - pm.addPass(createLowerAffinePass()); - pm.addPass(createConvertMathToLLVMPass()); - pm.addPass(createArithToLLVMConversionPass()); - pm.addPass(createFinalizeMemRefToLLVMConversionPass()); - pm.addPass(createCanonicalizerPass()); - pm.addPass(createCSEPass()); - ConvertFuncToLLVMPassOptions opts; - opts.useBarePtrCallConv = true; - pm.addPass(createConvertFuncToLLVMPass(opts)); - pm.addPass(createCanonicalizerPass()); - pm.addPass(createCSEPass()); - pm.addPass(createConvertControlFlowToLLVMPass()); - pm.addPass(createCanonicalizerPass()); - pm.addPass(createCSEPass()); -} - -int runTool(StringRef Program, ArrayRef Args, bool Verbose, - std::optional> Env = std::nullopt) { - if (Verbose) { - llvm::outs() << "Run:"; - if (Env) - for (auto &s : *Env) - llvm::outs() << " " << s; - llvm::outs() << " " << Program; - for (auto &s : Args) - llvm::outs() << " " << s; - llvm::outs() << "\n"; - } - std::string err_msg; - sys::ProcessStatistics stats; - std::optional opt_stats(stats); - SmallVector PArgs = {Program}; - PArgs.append(Args.begin(), Args.end()); - - SmallVector tmpPath; - auto ec = llvm::sys::fs::createTemporaryFile("run_tool", "", tmpPath); - if (ec) { - llvm::errs() << "Failed to create temporary file: " << ec.message() << "\n"; - return -1; - } - - // Convert tmpPath to a StringRef: - StringRef tp(tmpPath.begin(), tmpPath.size()); - - int result = sys::ExecuteAndWait(Program, PArgs, Env, {tp, tp, tp}, 0, 0, - &err_msg, nullptr, &opt_stats); - if (Verbose) { - llvm::outs() << (result == 0 ? "Succeeded " : "Failed ") << "in " - << std::chrono::duration_cast>( - stats.TotalTime) - .count() - << " code: " << result << "\n"; - std::ifstream t(tp.str()); - std::stringstream buffer; - buffer << t.rdbuf(); - llvm::outs() << buffer.str(); - } - return result; -} - -template -static void aieTargetDefines(SmallVector &Args, - std::string aie_target) { - if (aie_target == "AIE2") - Args.push_back("-D__AIEARCH__=20"); - else - Args.push_back("-D__AIEARCH__=10"); -} - -// Generate the elf files for the core -static LogicalResult generateCoreElfFiles(ModuleOp moduleOp, - const StringRef objFile, - XCLBinGenConfig &TK) { - auto deviceOps = moduleOp.getOps(); - if (!llvm::hasSingleElement(deviceOps)) - return moduleOp.emitOpError("expected a single device op"); - - AIE::DeviceOp deviceOp = *deviceOps.begin(); - auto tileOps = deviceOp.getOps(); - - std::string errorMessage; - - for (auto tileOp : tileOps) { - int col = tileOp.colIndex(); - int row = tileOp.rowIndex(); - auto coreOp = tileOp.getCoreOp(); - if (!coreOp) - continue; - - std::string elfFileName; - if (auto fileAttr = coreOp.getElfFileAttr()) { - elfFileName = std::string(fileAttr.getValue()); - } else { - elfFileName = std::string("core_") + std::to_string(col) + "_" + - std::to_string(row) + ".elf"; - coreOp.setElfFile(elfFileName); - } - - SmallString<64> elfFile(TK.TempDir); - sys::path::append(elfFile, elfFileName); - - if (TK.UseChess) { - // Use xbridge (to remove any peano dependency with use-chess option) - SmallString<64> bcfPath(TK.TempDir); - sys::path::append(bcfPath, elfFileName + ".bcf"); - - { - auto bcfOutput = openOutputFile(bcfPath, &errorMessage); - if (!bcfOutput) - return coreOp.emitOpError(errorMessage); - - if (failed(AIE::AIETranslateToBCF(moduleOp, bcfOutput->os(), col, row))) - return coreOp.emitOpError("Failed to generate BCF"); - bcfOutput->keep(); - } - - std::vector extractedIncludes; - { - auto bcfFileIn = openInputFile(bcfPath, &errorMessage); - if (!bcfFileIn) - moduleOp.emitOpError(errorMessage); - - std::string bcfFile = std::string(bcfFileIn->getBuffer()); - std::regex r("_include _file (.*)"); - auto begin = std::sregex_iterator(bcfFile.begin(), bcfFile.end(), r); - auto end = std::sregex_iterator(); - for (std::sregex_iterator i = begin; i != end; ++i) - extractedIncludes.push_back(i->str(1)); - } - - SmallString<64> chessWrapperBin(TK.InstallDir); - sys::path::append(chessWrapperBin, "bin", "xchesscc_wrapper"); - SmallString<64> chessworkDir(TK.TempDir); - sys::path::append(chessworkDir, "chesswork"); - - SmallVector flags{StringRef(TK.TargetArch).lower(), - "+w", - std::string(chessworkDir), - "-d", - "+l", - std::string(bcfPath), - "-o", - std::string(elfFile), - "-f", - std::string(objFile)}; - for (const auto &inc : extractedIncludes) - flags.push_back(inc); - - if (runTool(chessWrapperBin, flags, TK.Verbose) != 0) - coreOp.emitOpError("Failed to link with xbridge"); - } else { - SmallString<64> ldscript_path(TK.TempDir); - sys::path::append(ldscript_path, elfFileName + ".ld"); - { - auto ldscript_output = openOutputFile(ldscript_path, &errorMessage); - if (!ldscript_output) - return coreOp.emitOpError(errorMessage); - - if (failed(AIE::AIETranslateToLdScript(moduleOp, ldscript_output->os(), - col, row))) - return coreOp.emitOpError("failed to generate ld script for core (") - << col << "," << row << ")"; - ldscript_output->keep(); - } - - // We are running a clang command for now, but really this is an lld - // command. - { - std::string targetLower = StringRef(TK.TargetArch).lower(); - SmallVector flags; - flags.push_back("-O2"); - std::string targetFlag = "--target=" + targetLower + "-none-elf"; - flags.push_back(targetFlag); - flags.emplace_back(objFile); - SmallString<64> meBasicPath(TK.InstallDir); - sys::path::append(meBasicPath, "aie_runtime_lib", - StringRef(TK.TargetArch).upper(), "me_basic.o"); - flags.emplace_back(meBasicPath); - SmallString<64> libcPath(TK.PeanoDir); - sys::path::append(libcPath, "lib", targetLower + "-none-unknown-elf", - "libc.a"); - flags.emplace_back(libcPath); - flags.push_back("-Wl,--gc-sections"); - std::string ldScriptFlag = "-Wl,-T," + std::string(ldscript_path); - flags.push_back(ldScriptFlag); - flags.push_back("-o"); - flags.emplace_back(elfFile); - SmallString<64> clangBin(TK.PeanoDir); - sys::path::append(clangBin, "bin", "clang"); - if (runTool(clangBin, flags, TK.Verbose) != 0) - return coreOp.emitOpError("failed to link elf file for core(") - << col << "," << row << ")"; - } - } - } - return success(); -} - -static LogicalResult generateCDO(MLIRContext *context, ModuleOp moduleOp, - XCLBinGenConfig &TK) { - ModuleOp copy = moduleOp.clone(); - std::string errorMessage; - // This corresponds to `process_host_cgen`, which is listed as host - // compilation in aiecc.py... not sure we need this. - PassManager passManager(context, ModuleOp::getOperationName()); - applyConfigToPassManager(TK, passManager); - passManager.addNestedPass(AIE::createAIEPathfinderPass()); - - if (failed(passManager.run(copy))) - return moduleOp.emitOpError( - "failed to run passes to prepare of XCLBin generation"); - - if (failed(AIE::AIETranslateToCDODirect(copy, TK.TempDir))) - return moduleOp.emitOpError("failed to emit CDO"); - - copy->erase(); - return success(); -} - -static json::Object makeKernelJSON(std::string name, std::string id, - std::string instance) { - return json::Object{ - {"name", name}, - {"type", "dpu"}, - {"extended-data", json::Object{{"subtype", "DPU"}, - {"functional", "0"}, - {"dpu_kernel_id", id}}}, - {"arguments", json::Array{json::Object{{"name", "opcode"}, - {"address-qualifier", "SCALAR"}, - {"type", "uint64_t"}, - {"offset", "0x00"}}, - json::Object{{"name", "instr"}, - {"memory-connection", "SRAM"}, - {"address-qualifier", "GLOBAL"}, - {"type", "char *"}, - {"offset", "0x08"}}, - json::Object{{"name", "ninstr"}, - {"address-qualifier", "SCALAR"}, - {"type", "uint32_t"}, - {"offset", "0x10"}}, - json::Object{{"name", "bo0"}, - {"memory-connection", "HOST"}, - {"address-qualifier", "GLOBAL"}, - {"type", "void*"}, - {"offset", "0x14"}}, - json::Object{{"name", "bo1"}, - {"memory-connection", "HOST"}, - {"address-qualifier", "GLOBAL"}, - {"type", "void*"}, - {"offset", "0x1c"}}, - json::Object{{"name", "bo2"}, - {"memory-connection", "HOST"}, - {"address-qualifier", "GLOBAL"}, - {"type", "void*"}, - {"offset", "0x24"}}, - json::Object{{"name", "bo3"}, - {"memory-connection", "HOST"}, - {"address-qualifier", "GLOBAL"}, - {"type", "void*"}, - {"offset", "0x2c"}}, - json::Object{{"name", "bo4"}, - {"memory-connection", "HOST"}, - {"address-qualifier", "GLOBAL"}, - {"type", "void*"}, - {"offset", "0x34"}}, - json::Object{{"name", "bo5"}, - {"memory-connection", "HOST"}, - {"address-qualifier", "GLOBAL"}, - {"type", "void*"}, - {"offset", "0x3c"}}}}, - {"instances", json::Array{json::Object{{"name", instance}}}}}; -} - -static LogicalResult generateXCLBin(MLIRContext *context, ModuleOp moduleOp, - XCLBinGenConfig &TK, - const StringRef &Output, - const StringRef &inputXclbin = "") { - std::string errorMessage; - // Create mem_topology.json. - SmallString<64> memTopologyJsonFile(TK.TempDir); - sys::path::append(memTopologyJsonFile, "mem_topology.json"); - { - auto memTopologyJsonOut = - openOutputFile(memTopologyJsonFile, &errorMessage); - if (!memTopologyJsonOut) - return moduleOp.emitOpError(errorMessage); - - std::string mem_topology_data = R"({ - "mem_topology": { - "m_count": "2", - "m_mem_data": [ - { - "m_type": "MEM_DRAM", - "m_used": "1", - "m_sizeKB": "0x10000", - "m_tag": "HOST", - "m_base_address": "0x4000000" - }, - { - "m_type": "MEM_DRAM", - "m_used": "1", - "m_sizeKB": "0xc000", - "m_tag": "SRAM", - "m_base_address": "0x4000000" - } - ] - } - })"; - memTopologyJsonOut->os() << mem_topology_data; - memTopologyJsonOut->keep(); - } - - // Create aie_partition.json. - SmallString<64> aiePartitionJsonFile(TK.TempDir); - sys::path::append(aiePartitionJsonFile, "aie_partition.json"); - { - auto aiePartitionJsonOut = - openOutputFile(aiePartitionJsonFile, &errorMessage); - if (!aiePartitionJsonOut) - return moduleOp.emitOpError(errorMessage); - - std::string uuid_str = getUUIDString(); - std::string aie_partition_json_data = R"( - { - "aie_partition": { - "name": "QoS", - "operations_per_cycle": "2048", - "inference_fingerprint": "23423", - "pre_post_fingerprint": "12345", - "partition": { - "column_width": 4, - "start_columns": [ - 1 - ] - }, - "PDIs": [ - { - "uuid": ")" + uuid_str + R"(", - "file_name": "./design.pdi", - "cdo_groups": [ - { - "name": "DPU", - "type": "PRIMARY", - "pdi_id": "0x01", - "dpu_kernel_ids": [ - ")" + TK.XCLBinKernelID + - R"(" - ], - "pre_cdo_groups": [ - "0xC1" - ] - } - ] - } - ] - } - } - )"; - aiePartitionJsonOut->os() << aie_partition_json_data; - aiePartitionJsonOut->keep(); - } - - // Create kernels.json. - SmallString<64> kernelsJsonFile(TK.TempDir); - sys::path::append(kernelsJsonFile, "kernels.json"); - { - auto kernelsJsonOut = openOutputFile(kernelsJsonFile, &errorMessage); - if (!kernelsJsonOut) - return moduleOp.emitOpError(errorMessage); - - json::Object kernels_data{ - {"ps-kernels", - json::Object{ - {"kernels", - json::Array{// TODO: Support for multiple kernels - makeKernelJSON(TK.XCLBinKernelName, TK.XCLBinKernelID, - TK.XCLBinInstanceName)}}}}}; - kernelsJsonOut->os() << formatv("{0:2}", - json::Value(std::move(kernels_data))); - kernelsJsonOut->keep(); - } - // Create design.bif. - SmallString<64> designBifFile(TK.TempDir); - sys::path::append(designBifFile, "design.bif"); - { - auto designBifOut = openOutputFile(designBifFile, &errorMessage); - if (!designBifOut) - return moduleOp.emitOpError(errorMessage); - - designBifOut->os() << "all:\n" - << "{\n" - << "\tid_code = 0x14ca8093\n" - << "\textended_id_code = 0x01\n" - << "\timage\n" - << "\t{\n" - << "\t\tname=aie_image, id=0x1c000000\n" - << "\t\t{ type=cdo\n" - << "\t\t file=" << TK.TempDir << "/aie_cdo_elfs.bin\n" - << "\t\t file=" << TK.TempDir << "/aie_cdo_init.bin\n" - << "\t\t file=" << TK.TempDir << "/aie_cdo_enable.bin\n" - << "\t\t}\n" - << "\t}\n" - << "}"; - designBifOut->keep(); - } - - // Execute the bootgen command. - SmallString<64> designPdiFile(TK.TempDir); - sys::path::append(designPdiFile, "design.pdi"); - { - SmallVector flags{"-arch", "versal", - "-image", std::string(designBifFile), - "-o", std::string(designPdiFile), - "-w"}; - - SmallString<64> bootgenBin(TK.InstallDir); - sys::path::append(bootgenBin, "bin", "bootgen"); - if (runTool(bootgenBin, flags, TK.Verbose) != 0) - return moduleOp.emitOpError("failed to execute bootgen"); - } - SmallVector flags; - // Execute the xclbinutil command. - std::string memArg = "MEM_TOPOLOGY:JSON:" + std::string(memTopologyJsonFile); - std::string partArg = - "AIE_PARTITION:JSON:" + std::string(aiePartitionJsonFile); - { - - if (!inputXclbin.empty()) { - // Create aie_partition.json. - SmallString<64> aieInputPartitionJsonFile(TK.TempDir); - sys::path::append(aieInputPartitionJsonFile, "aie_input_partition.json"); - - std::string inputPartArg = - "AIE_PARTITION:JSON:" + std::string(aieInputPartitionJsonFile); - SmallVector inputFlags{"--dump-section", inputPartArg, - "--force", "--input", - std::string(inputXclbin)}; - if (auto xclbinutil = sys::findProgramByName("xclbinutil")) { - if (runTool(*xclbinutil, inputFlags, TK.Verbose) != 0) - return moduleOp.emitOpError("failed to execute xclbinutil"); - } else { - return moduleOp.emitOpError("could not find xclbinutil"); - } - auto aieInputPartitionOut = - openInputFile(aieInputPartitionJsonFile, &errorMessage); - if (!aieInputPartitionOut) - return moduleOp.emitOpError(errorMessage); - Expected aieInputPartitionOutValue = - llvm::json::parse(aieInputPartitionOut->getBuffer()); - json::Array *aieInputPartionPDIs; - aieInputPartionPDIs = aieInputPartitionOutValue->getAsObject() - ->getObject("aie_partition") - ->getArray("PDIs"); - auto aiePartitionOut = openInputFile(aiePartitionJsonFile, &errorMessage); - if (!aiePartitionOut) - return moduleOp.emitOpError(errorMessage); - llvm::Expected aiePartitionOutValue = - llvm::json::parse(aiePartitionOut->getBuffer()); - json::Array *aiePartionPDIs; - aiePartionPDIs = aiePartitionOutValue->getAsObject() - ->getObject("aie_partition") - ->getArray("PDIs"); - aieInputPartionPDIs->insert(aieInputPartionPDIs->end(), - aiePartionPDIs->begin(), - aiePartionPDIs->end()); - // rewrite aie partion json file - auto aiePartitionJsonOut = - openOutputFile(aiePartitionJsonFile, &errorMessage); - if (!aiePartitionJsonOut) - return moduleOp.emitOpError(errorMessage); - aiePartitionJsonOut->os() << formatv("{0:2}", *aieInputPartitionOutValue); - aiePartitionJsonOut->keep(); - flags.insert(flags.end(), {"--input", std::string(inputXclbin)}); - - } else { - flags.insert(flags.end(), {"--add-replace-section", memArg}); - } - flags.insert(flags.end(), {"--add-kernel", std::string(kernelsJsonFile), - "--add-replace-section", partArg, "--force", - "--output", std::string(Output)}); - - if (auto xclbinutil = sys::findProgramByName("xclbinutil")) { - if (runTool(*xclbinutil, flags, TK.Verbose) != 0) - return moduleOp.emitOpError("failed to execute xclbinutil"); - } else { - return moduleOp.emitOpError("could not find xclbinutil"); - } - } - return success(); -} - -static std::string chesshack(const std::string &input) { - std::string result(input); - static const std::unordered_map substitutions{ - {"memory\\(none\\)", "readnone"}, - {"memory\\(read\\)", "readonly"}, - {"memory\\(write\\)", "writeonly"}, - {"memory\\(argmem: readwrite\\)", "argmemonly"}, - {"memory\\(argmem: read\\)", "argmemonly readonly"}, - {"memory\\(argmem: write\\)", "argmemonly writeonly"}, - {"memory\\(inaccessiblemem: write\\)", "inaccessiblememonly writeonly"}, - {"memory\\(inaccessiblemem: readwrite\\)", "inaccessiblememonly"}, - {"memory\\(inaccessiblemem: read\\)", "inaccessiblememonly readonly"}, - {"memory(argmem: readwrite, inaccessiblemem: readwrite)", - "inaccessiblemem_or_argmemonly"}, - {"memory(argmem: read, inaccessiblemem: read)", - "inaccessiblemem_or_argmemonly readonly"}, - {"memory(argmem: write, inaccessiblemem: write)", - "inaccessiblemem_or_argmemonly writeonly"}, - }; - for (const auto &pair : substitutions) - result = std::regex_replace(result, std::regex(pair.first), pair.second); - return result; -} - -// A pass which removes the alignment attribute from llvm load operations, if -// the alignment is less than 4 (2 or 1). -// -// Example replaces: -// -// ``` -// %113 = llvm.load %112 {alignment = 2 : i64} : !llvm.ptr -> vector<32xbf16> -// ``` -// -// with -// -// ``` -// %113 = llvm.load %112 : !llvm.ptr -> vector<32xbf16> -// ``` -// -// If this pass is not included in the pipeline, there is an alignment error -// later in the compilation. This is a temporary workaround while a better -// solution is found: propagation of memref.assume_alignment is one option. See -// also https://jira.xilinx.com/projects/AIECC/issues/AIECC-589 -namespace { -struct RemoveAlignment2FromLLVMLoadPass - : public PassWrapper> { - void runOnOperation() override { - getOperation().walk([](Operation *op) { - if (auto loadOp = dyn_cast(op)) { - auto alignmentAttr = loadOp.getAlignmentAttr(); - if (alignmentAttr) { - int alignmentVal = alignmentAttr.getValue().getSExtValue(); - if (alignmentVal == 2 || alignmentVal == 1) { - loadOp.setAlignment(std::optional()); - } - } - } - }); - } - -public: - MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID( - RemoveAlignment2FromLLVMLoadPass); -}; -} // namespace - -static LogicalResult generateUnifiedObject(MLIRContext *context, - ModuleOp moduleOp, - XCLBinGenConfig &TK, - const std::string &outputFile) { - PassManager pm(context, moduleOp.getOperationName()); - applyConfigToPassManager(TK, pm); - - xilinx::xllvm::registerXLLVMDialectTranslation(*context); - pm.addNestedPass(AIE::createAIELocalizeLocksPass()); - pm.addNestedPass(AIE::createAIENormalizeAddressSpacesPass()); - pm.addPass(AIE::createAIECoreToStandardPass()); - pm.addPass(AIEX::createAIEXToStandardPass()); - - // Convert specific vector dialect ops (like vector.contract) to the AIEVec - // dialect - { - xilinx::aievec::ConvertVectorToAIEVecOptions vectorToAIEVecOptions{}; - - std::string optionsString = [&]() { - std::ostringstream optionsStringStream; - optionsStringStream << "target-backend="; - optionsStringStream << (TK.UseChess ? "cpp" : "llvmir"); - optionsStringStream << ' ' << "aie-target=aieml"; - return optionsStringStream.str(); - }(); - - if (failed(vectorToAIEVecOptions.parseFromString(optionsString))) { - return moduleOp.emitOpError("Failed to parse options from '") - << optionsString - << "': Failed to construct ConvertVectorToAIEVecOptions."; - } - xilinx::aievec::buildConvertVectorToAIEVec(pm, vectorToAIEVecOptions); - } - - addLowerToLLVMPasses(pm); - pm.addPass(std::make_unique()); - - if (TK.Verbose) { - llvm::outs() << "Running: "; - pm.printAsTextualPipeline(llvm::outs()); - llvm::outs() << "\n"; - } - - ModuleOp copy = moduleOp.clone(); - if (failed(pm.run(copy))) - return moduleOp.emitOpError("Failed to lower to LLVM"); - - SmallString<64> LLVMIRFile(TK.TempDir); - sys::path::append(LLVMIRFile, "input.ll"); - - llvm::LLVMContext llvmContext; - auto llvmModule = translateModuleToLLVMIR(copy, llvmContext); - if (!llvmModule) - return moduleOp.emitOpError("Failed to translate module to LLVMIR"); - - std::string errorMessage; - { - auto output = openOutputFile(LLVMIRFile, &errorMessage); - if (!output) - return moduleOp.emitOpError(errorMessage); - llvmModule->print(output->os(), nullptr); - output->keep(); - } - - if (TK.UseChess) { - SmallString<64> chessWrapperBin(TK.InstallDir); - sys::path::append(chessWrapperBin, "bin", "xchesscc_wrapper"); - - SmallString<64> chessworkDir(TK.TempDir); - sys::path::append(chessworkDir, "chesswork"); - - SmallString<64> chessIntrinsicsLL(TK.InstallDir); - sys::path::append(chessIntrinsicsLL, "aie_runtime_lib", - StringRef(TK.TargetArch).upper(), - "chess_intrinsic_wrapper.ll"); - - std::string llvmirString; - { - raw_string_ostream llvmirStream(llvmirString); - llvmModule->print(llvmirStream, nullptr); - } - - SmallString<64> chesslinkedFile(TK.TempDir); - sys::path::append(chesslinkedFile, "input.chesslinked.ll"); - SmallString<64> llvmLinkBin(TK.PeanoDir); - sys::path::append(llvmLinkBin, "bin", "llvm-link"); - if (!sys::fs::exists(llvmLinkBin)) { - if (auto llvmLink = sys::findProgramByName("llvm-link")) - llvmLinkBin = *llvmLink; - else - moduleOp.emitOpError("Can't find llvm-link"); - } - if (runTool(llvmLinkBin, - {std::string(LLVMIRFile), std::string(chessIntrinsicsLL), "-S", - "-o", std::string(chesslinkedFile)}, - TK.Verbose) != 0) - moduleOp.emitOpError("Couldn't link in the intrinsics"); - - std::string mungedLLVMIR; - { - auto chesslinkedIn = openInputFile(chesslinkedFile, &errorMessage); - if (!chesslinkedIn) - moduleOp.emitOpError(errorMessage); - - mungedLLVMIR = std::string(chesslinkedIn->getBuffer()); - mungedLLVMIR = chesshack(mungedLLVMIR); - } - { - auto chesslinkedOut = openOutputFile(chesslinkedFile); - if (!chesslinkedOut) - moduleOp.emitOpError(errorMessage); - - chesslinkedOut->os() << mungedLLVMIR; - chesslinkedOut->keep(); - } - - if (runTool(chessWrapperBin, - {StringRef(TK.TargetArch).lower(), "+w", - std::string(chessworkDir), "-c", "-d", "-f", "+P", "4", - std::string(chesslinkedFile), "-o", std::string(outputFile)}, - TK.Verbose) != 0) - return moduleOp.emitOpError("Failed to assemble with chess"); - } else { - SmallString<64> peanoOptBin(TK.PeanoDir); - sys::path::append(peanoOptBin, "bin", "opt"); - SmallString<64> peanoLLCBin(TK.PeanoDir); - sys::path::append(peanoLLCBin, "bin", "llc"); - - SmallString<64> OptLLVMIRFile(TK.TempDir); - sys::path::append(OptLLVMIRFile, "input.opt.ll"); - if (runTool(peanoOptBin, - {"-O2", "--inline-threshold=10", "-S", std::string(LLVMIRFile), - "--disable-builtin=memset", "-o", std::string(OptLLVMIRFile)}, - TK.Verbose) != 0) - return moduleOp.emitOpError("Failed to optimize"); - - if (runTool(peanoLLCBin, - {std::string(OptLLVMIRFile), "-O2", - "--march=" + StringRef(TK.TargetArch).lower(), - "--function-sections", "--filetype=obj", "-o", - std::string(outputFile)}, - TK.Verbose) != 0) - return moduleOp.emitOpError("Failed to assemble"); - } - copy->erase(); - return success(); -} - -LogicalResult xilinx::aie2xclbin(MLIRContext *ctx, ModuleOp moduleOp, - XCLBinGenConfig &TK, StringRef OutputNPU, - StringRef OutputXCLBin, - StringRef InputXCLBin = "") { - PassManager pm(ctx, moduleOp.getOperationName()); - applyConfigToPassManager(TK, pm); - - addAIELoweringPasses(pm); - - if (TK.Verbose) { - llvm::outs() << "Running: "; - pm.printAsTextualPipeline(llvm::outs()); - llvm::outs() << "\n"; - } - - if (failed(pm.run(moduleOp))) - return moduleOp.emitOpError("AIE lowering pipline failed"); - - raw_string_ostream target_arch_os(TK.TargetArch); - if (failed(AIE::AIETranslateToTargetArch(moduleOp, target_arch_os))) - return moduleOp.emitOpError("Couldn't detect target architure"); - - TK.TargetArch = StringRef(TK.TargetArch).trim(); - - std::regex target_regex("AIE.?"); - if (!std::regex_search(TK.TargetArch, target_regex)) - return moduleOp.emitOpError() - << "Unexpected target architecture: " << TK.TargetArch; - - // generateNPUInstructions - { - PassManager pm(ctx, moduleOp.getOperationName()); - applyConfigToPassManager(TK, pm); - - pm.addNestedPass(AIEX::createAIEDmaToNpuPass()); - ModuleOp copy = moduleOp.clone(); - if (failed(pm.run(copy))) - return moduleOp.emitOpError("NPU Instruction pipeline failed"); - - std::string errorMessage; - auto output = openOutputFile(OutputNPU, &errorMessage); - if (!output) { - llvm::errs() << errorMessage << "\n"; - return moduleOp.emitOpError(""); - } - - if (failed(AIE::AIETranslateToNPU(copy, output->os()))) - return moduleOp.emitOpError("NPU Instruction translation failed"); - - output->keep(); - copy->erase(); - } - - SmallString<64> unifiedObj(TK.TempDir); - sys::path::append(unifiedObj, "input.o"); - if (failed(generateUnifiedObject(ctx, moduleOp, TK, std::string(unifiedObj)))) - return moduleOp.emitOpError("Failed to generate unified object"); - - if (failed(generateCoreElfFiles(moduleOp, unifiedObj, TK))) - return moduleOp.emitOpError("Failed to generate core ELF file(s)"); - - if (failed(generateCDO(ctx, moduleOp, TK))) - return moduleOp.emitOpError("Failed to generate CDO"); - - if (failed(generateXCLBin(ctx, moduleOp, TK, OutputXCLBin, InputXCLBin))) - return moduleOp.emitOpError("Failed to generate XCLBin"); - - return success(); -} diff --git a/tools/aie2xclbin/XCLBinGen.h b/tools/aie2xclbin/XCLBinGen.h deleted file mode 100644 index a7e115dcf3..0000000000 --- a/tools/aie2xclbin/XCLBinGen.h +++ /dev/null @@ -1,48 +0,0 @@ -//===- XCLBinGen.h ---------------------------------------------*- C++ -*-===// -// -// This file is licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// (c) Copyright 2024 Xilinx Inc. -// -//===---------------------------------------------------------------------===// - -#include "mlir/IR/BuiltinOps.h" -#include "mlir/IR/MLIRContext.h" -#include "mlir/Support/LogicalResult.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/ADT/StringRef.h" -#include - -#pragma once - -namespace xilinx { - -struct XCLBinGenConfig { - std::string TargetArch; - std::string PeanoDir; - std::string InstallDir; - std::string AIEToolsDir; - std::string TempDir; - bool Verbose; - std::string HostArch; - std::string XCLBinKernelName; - std::string XCLBinKernelID; - std::string XCLBinInstanceName; - bool UseChess = false; - bool DisableThreading = false; - bool PrintIRAfterAll = false; - bool PrintIRBeforeAll = false; - bool PrintIRModuleScope = false; - bool Timing = false; -}; - -void findVitis(XCLBinGenConfig &TK); - -mlir::LogicalResult aie2xclbin(mlir::MLIRContext *ctx, mlir::ModuleOp moduleOp, - XCLBinGenConfig &TK, llvm::StringRef outputNPU, - llvm::StringRef outputXCLBin, - llvm::StringRef inputXCLBin); - -} // namespace xilinx diff --git a/tools/aie2xclbin/aie2xclbin.cpp b/tools/aie2xclbin/aie2xclbin.cpp deleted file mode 100644 index dac737245f..0000000000 --- a/tools/aie2xclbin/aie2xclbin.cpp +++ /dev/null @@ -1,228 +0,0 @@ -//===- aiecc.cpp -----------------------------------------------*- C++ -*-===// -// -// This file is licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// (c) Copyright 2024 Xilinx Inc. -// -//===---------------------------------------------------------------------===// - -#include "XCLBinGen.h" -#include "configure.h" - -#include "aie/Dialect/AIE/Transforms/AIEPasses.h" -#include "aie/Dialect/AIEX/Transforms/AIEXPasses.h" -#include "aie/InitialAllDialect.h" -#include "aie/Target/LLVMIR/Dialect/XLLVM/XLLVMToLLVMIRTranslation.h" -#include "aie/Targets/AIETargets.h" - -#include "mlir/Dialect/Affine/Passes.h" -#include "mlir/Dialect/Arith/IR/Arith.h" -#include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/Dialect/Vector/IR/VectorOps.h" -#include "mlir/IR/BuiltinOps.h" -#include "mlir/IR/MLIRContext.h" -#include "mlir/IR/OwningOpRef.h" -#include "mlir/Parser/Parser.h" -#include "mlir/Pass/PassManager.h" -#include "mlir/Target/LLVMIR/Dialect/Builtin/BuiltinToLLVMIRTranslation.h" -#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h" -#include "mlir/Target/LLVMIR/Export.h" -#include "mlir/Tools/mlir-translate/Translation.h" - -#include "llvm/ADT/SmallString.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/FileSystem.h" -#include "llvm/Support/JSON.h" -#include "llvm/Support/Path.h" -#include "llvm/Support/Program.h" -#include "llvm/Support/SourceMgr.h" - -#include -#include -#include -#include - -using namespace llvm; -using namespace mlir; -using namespace xilinx; - -cl::OptionCategory AIE2XCLBinCat("AIE To XCLBin Options", - "Options specific to the aie2xclbin tool"); - -cl::opt FileName(cl::Positional, cl::desc(""), - cl::Required, cl::cat(AIE2XCLBinCat)); - -cl::opt - TmpDir("tmpdir", cl::desc("Directory used for temporary file storage"), - cl::cat(AIE2XCLBinCat)); - -cl::opt Verbose("v", cl::desc("Trace commands as they are executed"), - cl::cat(AIE2XCLBinCat)); - -cl::opt - Peano("peano", cl::desc("Root directory where peano compiler is installed"), - cl::cat(AIE2XCLBinCat)); - -cl::opt - HostArch("host-target", cl::desc("Target architecture of the host program"), - cl::init(HOST_ARCHITECTURE), cl::cat(AIE2XCLBinCat)); - -cl::opt - NPUInstsName("npu-insts-name", - cl::desc("Output instructions filename for NPU target"), - cl::init("npu_insts.txt"), cl::cat(AIE2XCLBinCat)); - -cl::opt - PrintIRAfterAll("print-ir-after-all", - cl::desc("Configure all pass managers in lowering from aie " - "to xclbin to print IR after all passes"), - cl::init(false), cl::cat(AIE2XCLBinCat)); - -cl::opt - Timing("timing", - cl::desc("Configure all pass managers in lowering from aie to " - "xclbin to print timing information for each pass"), - cl::init(false), cl::cat(AIE2XCLBinCat)); - -cl::opt - PrintIRBeforeAll("print-ir-before-all", - cl::desc("Configure all pass managers in lowering from " - "aie to xclbin to print IR before all passes"), - cl::init(false), cl::cat(AIE2XCLBinCat)); - -cl::opt - DisableThreading("disable-threading", - cl::desc("Configure all pass managers in lowering from " - "aie to xclbin to disable multithreading"), - cl::init(false), cl::cat(AIE2XCLBinCat)); - -cl::opt PrintIRModuleScope( - "print-ir-module-scope", - cl::desc("Configure all pass managers in lowering from aie to xclbin to " - "print IR at the module scope"), - cl::init(false), cl::cat(AIE2XCLBinCat)); - -cl::opt - XCLBinName("xclbin-name", - cl::desc("Output xclbin filename for CDO/XCLBIN target"), - cl::init("final.xclbin"), cl::cat(AIE2XCLBinCat)); - -cl::opt inputXCLBinName( - "input-xclbin-name", - cl::desc("input xclbin filename on which new xclbin is merged into"), - cl::init(""), cl::cat(AIE2XCLBinCat)); - -cl::opt XCLBinKernelName("xclbin-kernel-name", - cl::desc("Kernel name in xclbin file"), - cl::init("MLIR_AIE"), - cl::cat(AIE2XCLBinCat)); - -cl::opt - XCLBinInstanceName("xclbin-instance-name", - cl::desc("Instance name in xclbin metadata"), - cl::init("MLIRAIEV1"), cl::cat(AIE2XCLBinCat)); - -cl::opt XCLBinKernelID("xclbin-kernel-id", - cl::desc("Kernel id in xclbin file"), - cl::init("0x901"), cl::cat(AIE2XCLBinCat)); - -cl::opt InstallDir("install-dir", - cl::desc("Root of mlir-aie installation"), - cl::cat(AIE2XCLBinCat)); - -cl::opt UseChess("use-chess", - cl::desc("Use chess compiler instead of peano"), - cl::cat(AIE2XCLBinCat)); - -int main(int argc, char *argv[]) { - registerAsmPrinterCLOptions(); - registerMLIRContextCLOptions(); - registerPassManagerCLOptions(); - registerTranslationCLOptions(); - cl::ParseCommandLineOptions(argc, argv); - - XCLBinGenConfig TK; - TK.Verbose = Verbose; - TK.HostArch = HostArch; - TK.XCLBinKernelName = XCLBinKernelName; - TK.XCLBinKernelID = XCLBinKernelID; - TK.XCLBinInstanceName = XCLBinInstanceName; - TK.UseChess = UseChess; - TK.DisableThreading = DisableThreading; - TK.PrintIRAfterAll = PrintIRAfterAll; - TK.PrintIRBeforeAll = PrintIRBeforeAll; - TK.PrintIRModuleScope = PrintIRModuleScope; - TK.Timing = Timing; - - if (TK.UseChess) - findVitis(TK); - - if (Verbose) - llvm::dbgs() << "\nCompiling " << FileName << "\n"; - - if (InstallDir.size()) { - TK.InstallDir = InstallDir; - } else { - // Navigate up from install/bin/aie2xclbin to install/ - TK.InstallDir = sys::path::parent_path(sys::path::parent_path(argv[0])); - } - TK.PeanoDir = Peano.getValue(); - if (!TK.UseChess && !sys::fs::is_directory(TK.PeanoDir)) { - llvm::errs() << "Peano path \"" << TK.PeanoDir << "\" is invalid\n"; - return 1; - } - - if (TmpDir.size()) - TK.TempDir = TmpDir.getValue(); - else - TK.TempDir = FileName + ".prj"; - - std::error_code err; - SmallString<64> tmpDir(TK.TempDir); - err = sys::fs::make_absolute(tmpDir); - if (err) - llvm::errs() << "Failed to make absolute path: " << err.message() << "\n"; - - TK.TempDir = std::string(tmpDir); - - err = sys::fs::create_directories(TK.TempDir); - if (err) { - llvm::errs() << "Failed to create temporary directory " << TK.TempDir - << ": " << err.message() << "\n"; - return 1; - } - - if (Verbose) - llvm::errs() << "Created temporary directory " << TK.TempDir << "\n"; - - MLIRContext ctx; - ParserConfig pcfg(&ctx); - SourceMgr srcMgr; - - DialectRegistry registry; - registry.insert(); - registry.insert(); - registry.insert(); - registry.insert(); - registry.insert(); - registry.insert(); - xilinx::registerAllDialects(registry); - registerBuiltinDialectTranslation(registry); - registerLLVMDialectTranslation(registry); - xilinx::xllvm::registerXLLVMDialectTranslation(registry); - ctx.appendDialectRegistry(registry); - - OwningOpRef owning = - parseSourceFile(FileName, srcMgr, pcfg); - - if (!owning) - return 1; - - if (failed(aie2xclbin(&ctx, *owning, TK, NPUInstsName.getValue(), - XCLBinName.getValue(), inputXCLBinName.getValue()))) - return 1; - - return 0; -} diff --git a/tools/aie2xclbin/configure.h.in b/tools/aie2xclbin/configure.h.in deleted file mode 100644 index 6c828a854e..0000000000 --- a/tools/aie2xclbin/configure.h.in +++ /dev/null @@ -1 +0,0 @@ -#define HOST_ARCHITECTURE "@LLVM_HOST_TRIPLE@"