From a863d44d1811aa87eda2fe199547b16f48f5e8ed Mon Sep 17 00:00:00 2001 From: erwei-xilinx Date: Thu, 29 Aug 2024 17:28:32 -0700 Subject: [PATCH] Configure AIE using control packets (#1728) Co-authored-by: Jeff Fifield --- lib/Dialect/AIEX/IR/AIEXDialect.cpp | 2 +- lib/Targets/AIETargetNPU.cpp | 2 +- python/compiler/txn2mlir.py | 140 +++-- test/npu-xrt/add_one_two_txn/run.lit | 2 +- test/npu-xrt/ctrl_packet_reconfig/aie.mlir | 415 +++++++++++++ .../ctrl_packet_reconfig/ctrl_pkts.txt | 555 ++++++++++++++++++ test/npu-xrt/ctrl_packet_reconfig/run.lit | 10 + test/npu-xrt/ctrl_packet_reconfig/test.cpp | 137 +++++ test/txn2mlir/generate_ctrl_pkt.mlir | 30 + test/txn2mlir/roundtrip_npu1_1col.mlir | 3 +- test/txn2mlir/roundtrip_npu1_4col.mlir | 3 +- 11 files changed, 1254 insertions(+), 45 deletions(-) create mode 100644 test/npu-xrt/ctrl_packet_reconfig/aie.mlir create mode 100644 test/npu-xrt/ctrl_packet_reconfig/ctrl_pkts.txt create mode 100644 test/npu-xrt/ctrl_packet_reconfig/run.lit create mode 100644 test/npu-xrt/ctrl_packet_reconfig/test.cpp create mode 100644 test/txn2mlir/generate_ctrl_pkt.mlir diff --git a/lib/Dialect/AIEX/IR/AIEXDialect.cpp b/lib/Dialect/AIEX/IR/AIEXDialect.cpp index 44ceafdd25..765a2a12ab 100644 --- a/lib/Dialect/AIEX/IR/AIEXDialect.cpp +++ b/lib/Dialect/AIEX/IR/AIEXDialect.cpp @@ -641,4 +641,4 @@ LogicalResult AIEX::DMAStartBdChainOp::verify() { } } return success(); -} \ No newline at end of file +} diff --git a/lib/Targets/AIETargetNPU.cpp b/lib/Targets/AIETargetNPU.cpp index 24a40c3385..641ed255d2 100644 --- a/lib/Targets/AIETargetNPU.cpp +++ b/lib/Targets/AIETargetNPU.cpp @@ -276,7 +276,7 @@ xilinx::AIE::AIETranslateToControlPackets(ModuleOp module, } return (p % 2) == 0; }; - uint32_t addr = op.getAddress(); + uint32_t addr = op.getAddress() & 0xFFFFF; uint32_t beats = size - 1; uint32_t opc = op.getOpcode(); uint32_t id = op.getStreamId(); diff --git a/python/compiler/txn2mlir.py b/python/compiler/txn2mlir.py index 
402adc8c23..3f5a09de99 100755 --- a/python/compiler/txn2mlir.py +++ b/python/compiler/txn2mlir.py @@ -14,6 +14,8 @@ import sys import struct +import argparse +import aie.extras.types as T def print_none(*args): @@ -63,7 +65,7 @@ def parse_txn(data, verbose=False): _, addr, size = struct.unpack("III", data[i + 4 : i + 16]) print_log(f"addr: {addr:#x}") print_log(f"size: {size}") - operations.append((opc, addr, data[i + 16 : i + size - 16])) + operations.append((opc, addr, data[i + 16 : i + size])) i = i + size elif opc == 0x03: print_log("opcode: MASKWRITE (0x03)") @@ -113,7 +115,7 @@ def parse_txn(data, verbose=False): return num_cols, operations -def operations_to_mlir(operations, columns=5): +def operations_to_mlir(operations, columns=5, mlir_ctrl_pkt=False): with Context(), Location.unknown(): module = Module.create() global_data = [] @@ -137,49 +139,107 @@ def device_body(): else: global_data.append(None) - @runtime_sequence() - def sequence(): - for op, payload in zip(operations, global_data): - if op[0] == 0x00: - addr = op[1] - value = op[2] - npu_write32(addr, value) - elif op[0] == 0x01: - addr = op[1] - d = memref.get_global( - payload.type_.value, payload.sym_name.value - ) - npu_blockwrite(addr, d) - elif op[0] == 0x03: - addr = op[1] - value = op[2] - mask = op[3] - npu_maskwrite32(addr, value, mask) - else: - raise Exception(f"Unhandled op: {op:#x}") + if mlir_ctrl_pkt: + # Runtime sequence arg0 as handle for ctrl packet raw data in host ddr + MAX_CTRL_PKTS_HOST_SIZE = 2048 + + @runtime_sequence(T.memref(MAX_CTRL_PKTS_HOST_SIZE, T.i32())) + def sequence(arg0): + for op, payload in zip(operations, global_data): + if op[0] == 0x00: + addr = op[1] + value = op[2] + control_packet( + address=addr, + opcode=0, + stream_id=0, + data=np.array([value]).astype(np.int32), + ) + elif op[0] == 0x01: + addr = op[1] + data = np.array(payload.initial_value, dtype=np.int32) + # Individual access cannot cross a 128-bit boundary. 
+ num_split_4s = (data.size + 3) // 4 + data_split_4 = data + if num_split_4s > 1: + data_split_4 = np.array_split( + data[: (num_split_4s - 1) * 4], num_split_4s + ) + data_split_4 = data_split_4.append( + data[(num_split_4s - 1) * 4 :] + ) + if num_split_4s == 2: + # Individual access cannot cross a 128-bit boundary. + data_split_4 = [data[:4], data[4:]] + for d_split in data_split_4: + control_packet( + address=addr, + opcode=0, + stream_id=0, + data=d_split, + ) + addr = addr + d_split.size * 4 + elif op[0] == 0x03: + addr = op[1] + value = op[2] + # mask (op[3]) is ignored, as control packet cannot do masked write + control_packet( + address=addr, + opcode=0, + stream_id=0, + data=np.array([value], dtype=np.int32), + ) + else: + raise Exception(f"Unhandled op: {op:#x}") + + else: + + @runtime_sequence() + def sequence(): + for op, payload in zip(operations, global_data): + if op[0] == 0x00: + addr = op[1] + value = op[2] + npu_write32(addr, value) + elif op[0] == 0x01: + addr = op[1] + d = memref.get_global( + payload.type_.value, payload.sym_name.value + ) + npu_blockwrite(addr, d) + elif op[0] == 0x03: + addr = op[1] + value = op[2] + mask = op[3] + npu_maskwrite32(addr, value, mask) + else: + raise Exception(f"Unhandled op: {op:#x}") return module if __name__ == "__main__": - # Check if command line arguments are provided - if len(sys.argv) == 1: - # Read data from standard input - data = sys.stdin.buffer.read() + # Parse arguments + parser = argparse.ArgumentParser() + parser.add_argument("-file", "-f", type=argparse.FileType("rb"), nargs="*") + parser.add_argument( + "-generate-ctrl-pkt", + dest="mlir_ctrl_pkt", + default=False, + action="store_true", + help="Enable MLIR control packet op generation", + ) + args = parser.parse_args() + + # Process each file provided as command line argument + operations = [] + for f in args.file: + # Read the data from the file + data = f.read() # Parse the TXN data - columns, operations = parse_txn(data) - else: - # 
Process each file provided as command line argument - operations = [] - for filename in sys.argv[1:]: - # Open the file in binary mode - with open(filename, "rb") as f: - # Read the data from the file - data = f.read() - # Parse the TXN data - columns, ops = parse_txn(data) - operations = operations + ops - - module = operations_to_mlir(operations, columns) + columns, ops = parse_txn(data) + operations = operations + ops + + module = operations_to_mlir(operations, columns, mlir_ctrl_pkt=args.mlir_ctrl_pkt) print(str(module)) diff --git a/test/npu-xrt/add_one_two_txn/run.lit b/test/npu-xrt/add_one_two_txn/run.lit index 32a30dad39..ea2723cf9f 100644 --- a/test/npu-xrt/add_one_two_txn/run.lit +++ b/test/npu-xrt/add_one_two_txn/run.lit @@ -6,7 +6,7 @@ // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem // RUN: %python aiecc.py --xclbin-kernel-name=ADDONE --xclbin-kernel-id=0x901 --xclbin-instance-name=ADDONEINST --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=add_one.xclbin --npu-insts-name=add_one_insts.txt %S/aie1.mlir // RUN: %python aiecc.py --no-aiesim --aie-generate-txn --aie-generate-npu --no-compile-host --npu-insts-name=add_two_insts.txt %S/aie2.mlir -// RUN: %python txn2mlir.py aie2.mlir.prj/txn.bin > add_two_cfg.mlir +// RUN: %python txn2mlir.py -f aie2.mlir.prj/txn.bin > add_two_cfg.mlir // RUN: aie-translate -aie-npu-instgen -aie-output-binary=true add_two_cfg.mlir -o add_two_cfg.bin // RUN: %run_on_npu ./test.exe -x add_one.xclbin -i add_one_insts.txt -c add_two_cfg.bin -j add_two_insts.txt | FileCheck %s // CHECK: PASS! 
diff --git a/test/npu-xrt/ctrl_packet_reconfig/aie.mlir b/test/npu-xrt/ctrl_packet_reconfig/aie.mlir new file mode 100644 index 0000000000..ec7a9bcc30 --- /dev/null +++ b/test/npu-xrt/ctrl_packet_reconfig/aie.mlir @@ -0,0 +1,415 @@ +module { + aie.device(npu1_1col) { + memref.global "public" @ctrlpkt0 : memref<1024xi32> + memref.global "public" @objFifo_out0 : memref<64x64xi8> + %tile_0_0 = aie.tile(0, 0) {controller_id = #aie.packet_info} + %tile_0_1 = aie.tile(0, 1) {controller_id = #aie.packet_info} + %tile_0_2 = aie.tile(0, 2) {controller_id = #aie.packet_info} + + aie.packet_flow(0) { + aie.packet_source<%tile_0_1, DMA : 0> + aie.packet_dest<%tile_0_2, DMA : 0> + } + aie.packet_flow(1) { + aie.packet_source<%tile_0_2, DMA : 0> + aie.packet_dest<%tile_0_1, DMA : 1> + } + aie.packet_flow(2) { + aie.packet_source<%tile_0_1, DMA : 1> + aie.packet_dest<%tile_0_0, DMA : 0> + } + aie.packet_flow(3) { + aie.packet_source<%tile_0_0, DMA : 0> + aie.packet_dest<%tile_0_1, DMA : 0> + } + aie.packet_flow(4) { + aie.packet_source<%tile_0_0, Ctrl : 0> + aie.packet_dest<%tile_0_0, South : 0> + } {keep_pkt_header = true} + // TODO: make shim tile ctrl packet flow part of the column control overlay + // aie.packet_flow(4) { + // aie.packet_source<%tile_0_0, DMA : 0> + // aie.packet_dest<%tile_0_0, Ctrl : 0> + // } {keep_pkt_header = true} + aie.packet_flow(5) { + aie.packet_source<%tile_0_0, DMA : 0> + aie.packet_dest<%tile_0_1, Ctrl : 0> + } {keep_pkt_header = true} + aie.packet_flow(6) { + aie.packet_source<%tile_0_0, DMA : 0> + aie.packet_dest<%tile_0_2, Ctrl : 0> + } {keep_pkt_header = true} + aie.shim_dma_allocation @ctrlpkt0(MM2S, 0, 0) + aie.shim_dma_allocation @objFifo_out0(S2MM, 0, 0) + aiex.runtime_sequence(%arg0: memref<64x64xi8>, %arg2: memref<64x64xi8>, %arg3: memref<1024xi32>) { + + // Reset core (0,2) + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 0][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + 
aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + + // Reset DMA channels (leads to deadlock) + // aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 2][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + // aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + // aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 4][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + // aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + // aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 6][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + // aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + // aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 8][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + // aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + + // Load core tile (0,2) program memory + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 10][1, 1, 1, 4][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 14][1, 1, 1, 4][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + 
aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 18][1, 1, 1, 4][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 22][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 27][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 32][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 37][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 42][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 47][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, 
direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 52][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 57][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 62][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 67][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 72][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 77][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 82][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel 
= 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 87][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 92][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 97][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 102][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 107][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 112][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 117][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = 
@ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 122][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 127][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 132][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 137][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 142][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 147][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 152][1, 1, 1, 5][0, 0, 0, 
1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 157][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 162][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 167][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 172][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 177][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 182][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + 
aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 187][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 192][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 197][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 202][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 207][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 212][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 217][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : 
i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 222][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 227][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 232][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 237][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 242][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 247][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 252][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + 
aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 257][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 262][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 267][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 272][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 277][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 282][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 287][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, 
issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 292][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 297][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 302][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 307][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 312][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 317][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 
0, 322][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 327][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 332][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 337][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 339][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 341][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 343][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, 
row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 345][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 347][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + + // Core tile (0,2) locks + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 349][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 351][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 353][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 355][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 357][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel 
= 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 359][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 361][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 363][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 365][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 367][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 369][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 371][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, 
metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 373][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 375][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 377][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 379][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 381][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 383][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 385][1, 1, 1, 
2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 387][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + + // memtile (0,1) locks + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 389][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 391][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 393][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 395][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + + // core tile bds + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 397][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, 
direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 402][1, 1, 1, 3][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 405][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 410][1, 1, 1, 3][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 413][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 415][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 417][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 419][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync 
{channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + + // memtile bds + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 421][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 426][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 431][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 436][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 441][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 446][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 451][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, 
issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 456][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 461][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 463][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 465][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 467][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 469][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 
0, 471][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 473][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 475][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + + // shim tile (0,0) bds + // TODO: make shim tile ctrl packet flow part of the column control overlay + // aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 477][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + // aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + // aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 479][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + // aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + // aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 481][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + // aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + // aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 483][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : 
memref<1024xi32> + // aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + // aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 485][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + // aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + // aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 487][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + // aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + // aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 489][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + // aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + // aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 491][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + // aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + // aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 493][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + // aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + + // memtile stream switches + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 495][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + 
aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 497][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 499][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 501][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 503][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 505][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 507][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 509][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : 
i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 511][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 513][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 515][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 517][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 519][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 521][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 523][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + 
aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 525][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 527][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 529][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + + // core tile stream switches + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 531][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 533][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 535][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 537][1, 1, 1, 2][0, 0, 0, 
1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 539][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 541][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 543][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 545][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 547][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + + // shim tile stream switches + // TODO: make shim tile ctrl packet flow part of the column control overlay + // aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 549][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + // aiex.npu.sync {channel = 
0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + // aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 551][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + // aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 553][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + + // AIE design's instructions + %c0_i64 = arith.constant 0 : i64 + %c1_i64 = arith.constant 1 : i64 + %c56_i64 = arith.constant 56 : i64 + %c61_i64 = arith.constant 61 : i64 + %c64_i64 = arith.constant 64 : i64 + aiex.npu.dma_memcpy_nd (0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c0_i64, %c64_i64, %c1_i64], packet = ) {id = 0 : i64, metadata = @ctrlpkt0} : memref<64x64xi8> + aiex.npu.dma_memcpy_nd (0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c0_i64, %c64_i64, %c1_i64]) {id = 1 : i64, metadata = @objFifo_out0, issue_token = true} : memref<64x64xi8> + aiex.npu.dma_wait { symbol = @objFifo_out0 } + + } + + } +} + diff --git a/test/npu-xrt/ctrl_packet_reconfig/ctrl_pkts.txt b/test/npu-xrt/ctrl_packet_reconfig/ctrl_pkts.txt new file mode 100644 index 0000000000..e470486471 --- /dev/null +++ b/test/npu-xrt/ctrl_packet_reconfig/ctrl_pkts.txt @@ -0,0 +1,555 @@ +00032000 +00000000 +8001DE10 +00000002 +0001DE18 +00000002 +0001DE00 +00000002 +8001DE08 +00000002 +00204400 +00000260 +00000000 +00000000 +0020440C +00000000 +00000000 +00000000 +00204418 +00000000 +00000000 +00000008 +00320000 +38001043 +000001C3 +08000055 +00550000 +80320010 +00000C00 +16310799 +40400195 +0001C000 
+80320020 +00010001 +00010001 +FC7FF855 +7659FFFF +00320030 +782F1F30 +04B20000 +00000062 +00000000 +80320040 +DC8C764D +0001DFF0 +00010001 +00010001 +00320050 +14190001 +00011000 +00010001 +00010001 +00320060 +14B10899 +20400195 +0001C000 +00010001 +80320070 +00010001 +183E7659 +244B2003 +00000000 +80320080 +70000115 +00010000 +00010001 +00010001 +00320090 +88000115 +06990001 +00011830 +00010001 +003200A0 +880003C0 +00000003 +00000000 +00000000 +803200B0 +00010001 +00010001 +00010001 +10000819 +003200C0 +00010001 +00010001 +00010001 +00000019 +803200D0 +68000095 +00010000 +00010001 +00010001 +803200E0 +38032019 +0FFFC299 +10000115 +C93B0001 +003200F0 +062827FF +FFECC000 +00010001 +00010001 +80320100 +10000115 +C8430001 +064827FF +00010000 +00320110 +00010001 +00000137 +00000000 +00000000 +00320120 +38B010BB +08000000 +805507E6 +000008E3 +80320130 +00680055 +10BB0007 +01C89A00 +00000000 +00320140 +100003C0 +000118A8 +00000000 +00000000 +80320150 +7801FD1D +000102A6 +00010001 +00010001 +80320160 +880003C0 +00000003 +00000000 +000022A0 +00320170 +380003C0 +0000064F +78000000 +000002A6 +00320180 +880003C0 +00000003 +00000000 +00000000 +80320190 +880003C0 +00000003 +00000000 +00000000 +803201A0 +880003C0 +00000003 +00000000 +00000000 +003201B0 +880003C0 +00000003 +00000000 +00000000 +803201C0 +880003C0 +00000003 +00000000 +00000000 +003201D0 +16308C19 +000C9E6D +000122A0 +00010001 +003201E0 +01150001 +00012000 +2003C843 +00000608 +803201F0 +00010001 +8EBB0001 +00000003 +00000000 +80320200 +07FFC2D9 +07FF6659 +20000095 +00010001 +00320210 +00010001 +C83B0001 +10280067 +07FFE000 +00320220 +10121219 +10001819 +00010001 +00010001 +80320230 +880003C0 +00000003 +00000000 +00000000 +00320240 +10101219 +10001819 +00010001 +00010001 +80320250 +880003C0 +00000003 +00000000 +00000000 +80320260 +00000055 +00550000 +00000C00 +16320799 +00320270 +70400195 +2019C801 +E4193803 +E2190FFE +00320280 +20030FFF +C002273B +2003FFEC +50039E0B +80320290 +007FFFC8 +00020000 +0000004C +00000000 
+803202A0 +070386D9 +00010001 +00010001 +00010001 +003202B0 +10001419 +1C8E7659 +00010001 +00010001 +803202C0 +14F12899 +50400195 +0001C001 +00010001 +003202D0 +782F0001 +00380000 +00000040 +00000000 +003202E0 +07FE42D9 +07FEE459 +07FFE259 +07FF6659 +803202F0 +00010001 +18190001 +96451000 +07FFE738 +00320300 +00010001 +8EBB0001 +00000003 +00000000 +80320310 +1A1010B7 +200001D0 +FFA04803 +62001077 +80320320 +C28001D0 +00508FFF +A31B2843 +00004108 +00320330 +19B8113B +40000003 +2003FFE9 +0002001B +80320340 +36590051 +36591C4B +36591C8A +28431CC5 +00320350 +2F1A849B +28BB0004 +3106A41B +00000006 +00320360 +280003C0 +60F798CB +00000005 +00000000 +80320370 +07038ED9 +07FB86D9 +00010001 +00010001 +80320380 +54190001 +00011000 +00010001 +00000019 +00320390 +280003C0 +0002E00B +00000000 +00000000 +003203A0 +15AD8C19 +180B9659 +00010001 +00010001 +803203B0 +07FFC2D9 +07FF4259 +00010001 +00010001 +003203C0 +18190001 +00011000 +15602019 +4CCB2843 +803203D0 +00292101 +CC4B28B7 +00254102 +00000000 +803203E0 +280003C0 +C1000C0B +00000002 +FFFC0000 +8001DE10 +00000000 +0001DE18 +00000000 +0001DE00 +00000000 +8001DE08 +00000000 +00032000 +00000002 +00032000 +00000000 +0001F000 +00000000 +8001F010 +00000000 +8001F020 +00000000 +0001F030 +00000000 +8001F040 +00000000 +0001F050 +00000000 +0001F060 +00000000 +8001F070 +00000000 +8001F080 +00000000 +0001F090 +00000000 +0001F0A0 +00000000 +8001F0B0 +00000000 +0001F0C0 +00000000 +8001F0D0 +00000000 +8001F0E0 +00000000 +0001F0F0 +00000000 +0001F000 +00000001 +8001F010 +00000000 +8001F020 +00000001 +0001F030 +00000000 +800C0000 +00000001 +000C0010 +00000000 +000C0020 +00000001 +800C0030 +00000000 +8031D000 +00400400 +00000000 +00000000 +00000000 +8011D010 +00000000 +06043FE0 +0031D020 +02400400 +40080000 +00000000 +00000000 +0011D030 +00000000 +0E045FE3 +8001DE04 +00000000 +0001DE00 +00000001 +0001DE14 +00000001 +8001DE10 +00000001 +803A0000 +00000400 +000A0000 +00000000 +00000000 +003A0010 +00000000 +00000000 +00000000 +8141FF40 +003A0020 
+80000400 +001A0000 +00000000 +00000000 +803A0030 +00000000 +00000000 +00000000 +8140FF41 +803A0300 +81000400 +018A0800 +00000000 +00000000 +003A0310 +00000000 +00000000 +00000000 +8142FF43 +003A0320 +00000400 +019A0800 +00000000 +00000000 +803A0330 +00000000 +00000000 +00000000 +8143FF42 +000A0604 +00000000 +800A0600 +00000001 +000A0634 +00000001 +800A0630 +00000001 +800A063C +00000018 +000A0638 +00000001 +800A060C +00000019 +000A0608 +00000001 +0003F010 +C0000088 +0003F008 +C000000A +0003F040 +C0000009 +0003F114 +C0000000 +0003F250 +00180101 +0003F100 +C0000000 +0003F200 +041F0102 +8003F140 +C0000000 +8003F300 +021F0100 +000B0000 +C000008B +800B0004 +C0000089 +000B0024 +C000000A +000B0030 +C0000008 +000B003C +C000000D +000B0018 +C000000C +000B012C +C0000000 +000B02B0 +061F0105 +000B012C +C0000000 +800B02B4 +051F0104 +000B012C +C0000000 +800B02B8 +031F0103 +000B0104 +C0000000 +000B0210 +021F0102 +000B0134 +C0000000 +000B02D0 +011F0101 +800B0100 +C0000000 +800B0200 +001F0100 +0003F004 +C0000088 +8003F014 +C0000009 +8003F00C +C000000A +0003F124 +C0000000 +0003F290 +061F0102 +8003F104 +C0000000 +8003F210 +011F0101 +0003F118 +C0000000 +0003F260 +001F0100 +0001F000 +00000400 +8001F004 +00000010 +00032000 +00000001 diff --git a/test/npu-xrt/ctrl_packet_reconfig/run.lit b/test/npu-xrt/ctrl_packet_reconfig/run.lit new file mode 100644 index 0000000000..f9ab487b9c --- /dev/null +++ b/test/npu-xrt/ctrl_packet_reconfig/run.lit @@ -0,0 +1,10 @@ +// (c) Copyright 2024 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai +// +// RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --basic-alloc-scheme --generate-ctrl-pkt-overlay --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %S/aie.mlir +// RUN: cp %S/ctrl_pkts.txt . 
+// RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
+// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+// CHECK: PASS!
diff --git a/test/npu-xrt/ctrl_packet_reconfig/test.cpp b/test/npu-xrt/ctrl_packet_reconfig/test.cpp
new file mode 100644
index 0000000000..32bb72fbf9
--- /dev/null
+++ b/test/npu-xrt/ctrl_packet_reconfig/test.cpp
@@ -0,0 +1,163 @@
+//===- test.cpp -------------------------------------------000---*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+constexpr int IN_SIZE = 64 * 64;
+constexpr int OUT_SIZE = 64 * 64;
+// Capacity, in 32-bit words, of the buffer that receives the control packets.
+constexpr int CTRL_IN_SIZE = 1024;
+
+#define IN_DATATYPE int8_t
+#define OUT_DATATYPE int8_t
+
+// Reads a text file holding one hexadecimal 32-bit word per line and returns
+// the words in file order. Throws std::runtime_error if the file cannot be
+// opened or if a line cannot be parsed as a hexadecimal number.
+std::vector<uint32_t> load_instr_sequence(const std::string &instr_path) {
+  std::ifstream instr_file(instr_path);
+  if (!instr_file) {
+    throw std::runtime_error("Unable to open instruction file " + instr_path +
+                             "\n");
+  }
+  std::vector<uint32_t> instr_v;
+  std::string line;
+  while (std::getline(instr_file, line)) {
+    std::istringstream iss(line);
+    uint32_t a;
+    if (!(iss >> std::hex >> a)) {
+      throw std::runtime_error("Unable to parse instruction file\n");
+    }
+    instr_v.push_back(a);
+  }
+  return instr_v;
+}
+
+// Loads the instruction stream and the control packets, runs the kernel once
+// on an input filled with ones and checks every output element. Returns 0 and
+// prints "PASS!" on success, 1 otherwise.
+int main(int argc, const char *argv[]) {
+  // The command-line arguments are not inspected; all file names used below
+  // are hard-coded.
+  (void)argc;
+  (void)argv;
+
+  std::vector<uint32_t> instr_v = load_instr_sequence("insts.txt");
+  std::vector<uint32_t> ctrlPackets = load_instr_sequence("ctrl_pkts.txt");
+
+  // The control packets are copied into a buffer of CTRL_IN_SIZE words below;
+  // refuse to run rather than write past its end.
+  if (ctrlPackets.size() > static_cast<std::size_t>(CTRL_IN_SIZE)) {
+    std::cout << "Too many control packet words: " << ctrlPackets.size()
+              << " > " << CTRL_IN_SIZE << "\n";
+    return 1;
+  }
+
+  // Start the XRT test code
+  // Get a device handle
+  unsigned int device_index = 0;
+  auto device = xrt::device(device_index);
+
+  // Load the xclbin
+  auto xclbin = xrt::xclbin("aie.xclbin");
+
+  std::string Node = "MLIR_AIE";
+
+  // Get the kernel from the xclbin: the first one whose name starts with Node
+  auto xkernels = xclbin.get_kernels();
+  auto xkernelIt = std::find_if(xkernels.begin(), xkernels.end(),
+                                [Node](xrt::xclbin::kernel &k) {
+                                  auto name = k.get_name();
+                                  std::cout << "Name: " << name << std::endl;
+                                  return name.rfind(Node, 0) == 0;
+                                });
+  if (xkernelIt == xkernels.end()) {
+    std::cout << "No kernel whose name starts with " << Node << " found\n";
+    return 1;
+  }
+  auto kernelName = xkernelIt->get_name();
+
+  device.register_xclbin(xclbin);
+
+  // get a hardware context
+  xrt::hw_context context(device, xclbin.get_uuid());
+
+  // get a kernel handle
+  auto kernel = xrt::kernel(context, kernelName);
+
+  auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(uint32_t),
+                          XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
+  auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(IN_DATATYPE),
+                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+  auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(OUT_DATATYPE),
+                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
+  auto bo_ctrlpkt = xrt::bo(device, CTRL_IN_SIZE * sizeof(uint32_t),
+                            XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5));
+
+  // Fill the whole input buffer with ones
+  IN_DATATYPE *bufInA = bo_inA.map<IN_DATATYPE *>();
+  std::vector<IN_DATATYPE> srcVecA(IN_SIZE, IN_DATATYPE(1));
+  std::memcpy(bufInA, srcVecA.data(), srcVecA.size() * sizeof(IN_DATATYPE));
+
+  void *bufInstr = bo_instr.map<void *>();
+  std::memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(uint32_t));
+
+  void *bufctrlpkt = bo_ctrlpkt.map<void *>();
+  std::memcpy(bufctrlpkt, ctrlPackets.data(),
+              ctrlPackets.size() * sizeof(uint32_t));
+
+  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_ctrlpkt.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  unsigned int opcode = 3;
+  auto run =
+      kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_out, bo_ctrlpkt);
+  ert_cmd_state r = run.wait();
+  if (r != ERT_CMD_STATE_COMPLETED) {
+    std::cout << "Kernel did not complete. Returned status: " << r << "\n";
+    return 1;
+  }
+
+  bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+
+  OUT_DATATYPE *bufOut = bo_out.map<OUT_DATATYPE *>();
+
+  // Every output element is expected to equal 1 + 12
+  const int ref = 1 + 12;
+  int errors = 0;
+  for (int i = 0; i < OUT_SIZE; i++) {
+    const int got = bufOut[i];
+    if (got != ref) {
+      std::cout << "Error in output " << got << " != " << ref << std::endl;
+      errors++;
+    }
+  }
+
+  if (!errors) {
+    std::cout << "\nPASS!\n\n";
+    return 0;
+  }
+
+  std::cout << "\nfailed.\n\n";
+  return 1;
+}
diff --git a/test/txn2mlir/generate_ctrl_pkt.mlir b/test/txn2mlir/generate_ctrl_pkt.mlir
new file mode 100644
index 0000000000..98087cdd97
--- /dev/null
+++ b/test/txn2mlir/generate_ctrl_pkt.mlir
@@ -0,0 +1,30 @@
+//===- generate_ctrl_pkt.mlir -----------------------------------*- MLIR -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+// +//===----------------------------------------------------------------------===// + +// RUN: aie-translate -aie-npu-instgen -aie-output-binary=true %s -o ./generate_ctrl_pkt_cfg.bin +// RUN: %python txn2mlir.py -f ./generate_ctrl_pkt_cfg.bin -generate-ctrl-pkt | FileCheck %s + +// CHECK: aie.device(npu1_1col) +// CHECK: memref.global "private" constant @blockwrite_data : memref<6xi32> = dense<[4195328, 0, 0, 0, 0, 100941792]> +// CHECK: aiex.control_packet {address = 2301952 : ui32, data = array<i32: 2>, opcode = 0 : i32, stream_id = 0 : i32} +// CHECK: aiex.control_packet {address = 2224128 : ui32, data = array<i32: 2>, opcode = 0 : i32, stream_id = 0 : i32} +// CHECK: aiex.control_packet {address = 2215936 : ui32, data = array<i32: 4195328, 0, 0, 0>, opcode = 0 : i32, stream_id = 0 : i32} +// CHECK: aiex.control_packet {address = 2215952 : ui32, data = array<i32: 0, 100941792>, opcode = 0 : i32, stream_id = 0 : i32} +module { + aie.device(npu1_1col) { + memref.global "private" constant @blockwrite_data : memref<6xi32> = dense<[4195328, 0, 0, 0, 0, 100941792]> + aiex.runtime_sequence() { + aiex.npu.maskwrite32 {address = 2301952 : ui32, mask = 2 : ui32, value = 2 : ui32} + aiex.npu.write32 {address = 2224128 : ui32, value = 2 : ui32} + %2 = memref.get_global @blockwrite_data : memref<6xi32> + aiex.npu.blockwrite(%2) {address = 2215936 : ui32} : memref<6xi32> + } + } +} diff --git a/test/txn2mlir/roundtrip_npu1_1col.mlir b/test/txn2mlir/roundtrip_npu1_1col.mlir index efe751f29c..b251f115d1 100644 --- a/test/txn2mlir/roundtrip_npu1_1col.mlir +++ b/test/txn2mlir/roundtrip_npu1_1col.mlir @@ -8,7 +8,8 @@ // //===----------------------------------------------------------------------===// -// RUN: aie-translate -aie-npu-instgen -aie-output-binary=true %s | %python txn2mlir.py | FileCheck %s +// RUN: aie-translate -aie-npu-instgen -aie-output-binary=true %s -o ./roundtrip_npu1_1col_cfg.bin +// RUN: %python txn2mlir.py -f ./roundtrip_npu1_1col_cfg.bin | FileCheck %s // CHECK: aie.device(npu1_1col) // CHECK: memref.global 
"private" constant @blockwrite_data : memref<2xi32> = dense<[4195328, 0]> diff --git a/test/txn2mlir/roundtrip_npu1_4col.mlir b/test/txn2mlir/roundtrip_npu1_4col.mlir index 4b413acd0a..0059e3e4a4 100644 --- a/test/txn2mlir/roundtrip_npu1_4col.mlir +++ b/test/txn2mlir/roundtrip_npu1_4col.mlir @@ -8,7 +8,8 @@ // //===----------------------------------------------------------------------===// -// RUN: aie-translate -aie-npu-instgen -aie-output-binary=true %s | %python txn2mlir.py | FileCheck %s +// RUN: aie-translate -aie-npu-instgen -aie-output-binary=true %s -o ./roundtrip_npu1_4col_cfg.bin +// RUN: %python txn2mlir.py -f ./roundtrip_npu1_4col_cfg.bin | FileCheck %s // CHECK: aie.device(npu1_4col) // CHECK: aiex.npu.maskwrite32 {address = 2301952 : ui32, mask = 1 : ui32, value = 1 : ui32}