From 67b26ffec1814c8ce9f552bc3faf0269559e12be Mon Sep 17 00:00:00 2001 From: Zhewen Yu <42230979+Yu-Zhewen@users.noreply.github.com> Date: Thu, 30 May 2024 02:15:36 +0100 Subject: [PATCH] Matmul cascade (#1465) --- .../AIE/Transforms/AIECreatePacketFlows.cpp | 34 +- programming_examples/utils/parse_trace.py | 64 +- .../trace_packet_routing.mlir | 28 + .../README.md | 38 + .../aie_bufferx4.mlir | 648 ++++++++++++++++++ .../aie_cascadex4.mlir | 496 ++++++++++++++ .../aie_plainx1.mlir | 201 ++++++ .../aie_plainx4.mlir | 532 ++++++++++++++ .../matrix_multiplication.h | 316 +++++++++ .../matrix_multiplication_using_cascade/mm.cc | 89 +++ .../run.lit | 19 + .../test.cpp | 231 +++++++ 12 files changed, 2681 insertions(+), 15 deletions(-) create mode 100644 test/create-packet-flows/trace_packet_routing.mlir create mode 100644 test/npu-xrt/matrix_multiplication_using_cascade/README.md create mode 100644 test/npu-xrt/matrix_multiplication_using_cascade/aie_bufferx4.mlir create mode 100644 test/npu-xrt/matrix_multiplication_using_cascade/aie_cascadex4.mlir create mode 100644 test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx1.mlir create mode 100644 test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx4.mlir create mode 100644 test/npu-xrt/matrix_multiplication_using_cascade/matrix_multiplication.h create mode 100644 test/npu-xrt/matrix_multiplication_using_cascade/mm.cc create mode 100644 test/npu-xrt/matrix_multiplication_using_cascade/run.lit create mode 100644 test/npu-xrt/matrix_multiplication_using_cascade/test.cpp diff --git a/lib/Dialect/AIE/Transforms/AIECreatePacketFlows.cpp b/lib/Dialect/AIE/Transforms/AIECreatePacketFlows.cpp index e42e498e3c..d86e87cc9a 100644 --- a/lib/Dialect/AIE/Transforms/AIECreatePacketFlows.cpp +++ b/lib/Dialect/AIE/Transforms/AIECreatePacketFlows.cpp @@ -157,10 +157,17 @@ void updateCoordinates(int &xCur, int &yCur, WireBundle move) { // Build a packet-switched route from the sourse to the destination with the // given ID. The route is recorded in the given map of switchboxes. 
void buildPSRoute( - int xSrc, int ySrc, Port sourcePort, int xDest, int yDest, Port destPort, - int flowID, + TileOp srcTile, Port sourcePort, TileOp destTile, Port destPort, int flowID, DenseMap, 8>> &switchboxes, bool reverseOrder = false) { + + int xSrc = srcTile.colIndex(); + int ySrc = srcTile.rowIndex(); + int xDest = destTile.colIndex(); + int yDest = destTile.rowIndex(); + + const auto &targetModel = getTargetModel(srcTile); + int xCur = xSrc; int yCur = ySrc; WireBundle curBundle = {}; @@ -213,6 +220,13 @@ void buildPSRoute( if (move == lastBundle) continue; + // If the source port is a trace port, we need to validate the destination + if (xCur == xSrc && yCur == ySrc && + sourcePort.bundle == WireBundle::Trace && + !targetModel.isValidTraceMaster(xSrc, ySrc, move, curChannel)) { + continue; + } + updateCoordinates(xCur, yCur, move); if (std::find(congestion.begin(), congestion.end(), TileID{xCur, yCur}) != @@ -320,22 +334,18 @@ struct AIERoutePacketFlowsPass Region &r = pktflow.getPorts(); Block &b = r.front(); int flowID = pktflow.IDInt(); - int xSrc = 0, ySrc = 0; - Port sourcePort; + Port sourcePort, destPort; + TileOp srcTile, destTile; for (Operation &Op : b.getOperations()) { if (auto pktSource = dyn_cast(Op)) { - auto srcTile = dyn_cast(pktSource.getTile().getDefiningOp()); - xSrc = srcTile.colIndex(); - ySrc = srcTile.rowIndex(); + srcTile = dyn_cast(pktSource.getTile().getDefiningOp()); sourcePort = pktSource.port(); } else if (auto pktDest = dyn_cast(Op)) { - auto destTile = dyn_cast(pktDest.getTile().getDefiningOp()); - int xDest = destTile.colIndex(); - int yDest = destTile.rowIndex(); - Port destPort = pktDest.port(); + destTile = dyn_cast(pktDest.getTile().getDefiningOp()); + destPort = pktDest.port(); - buildPSRoute(xSrc, ySrc, sourcePort, xDest, yDest, destPort, flowID, + buildPSRoute(srcTile, sourcePort, destTile, destPort, flowID, switchboxes, true); // Assign "keep_pkt_header flag" diff --git a/programming_examples/utils/parse_trace.py b/programming_examples/utils/parse_trace.py index 9d2cd144a6..23078ca9ad 100755 --- a/programming_examples/utils/parse_trace.py +++ b/programming_examples/utils/parse_trace.py @@ -702,8 +702,42 @@ def parse_mlir_trace_events(lines): pid_events[1][key][5] = (value >> 8) & 0xFF pid_events[1][key][6] = (value >> 16) & 0xFF pid_events[1][key][7] = (value >> 24) & 0xFF - - # TODO intfc and memtile event 0, 1 needs to also be defined + # memtile event 0 + elif address == 0x940E0: # 606432 + if pid_events[3].get(key) == None: + pid_events[3][key] = [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + ] # TODO no better way to init this? + # print("Trace event 0 configured to be ",hex(value)) + pid_events[3][key][0] = value & 0xFF + pid_events[3][key][1] = (value >> 8) & 0xFF + pid_events[3][key][2] = (value >> 16) & 0xFF + pid_events[3][key][3] = (value >> 24) & 0xFF + # memtile event 1 + elif address == 0x940E4: # 606436 + if pid_events[3].get(key) == None: + pid_events[3][key] = [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + ] # TODO no better way to init this? 
+ pid_events[3][key][4] = value & 0xFF + pid_events[3][key][5] = (value >> 8) & 0xFF + pid_events[3][key][6] = (value >> 16) & 0xFF + pid_events[3][key][7] = (value >> 24) & 0xFF + # TODO intfc event 0, 1 needs to also be defined # print("Found labels:\n") # for j in pid_events: @@ -750,7 +784,9 @@ def lookup_event_name_by_type(trace_type, code): # Mem traces elif trace_type == 1: # TODO Need to define these - if code == 21: # x15 + if code == 0x1: + event = "True" + elif code == 21: # x15 event = "DMA s2mm 0 start bd" elif code == 22: # x16 event = "DMA s2mm 1 start bd" @@ -780,6 +816,28 @@ def lookup_event_name_by_type(trace_type, code): event = "DMA s2mm 1 stalled lock acquire" else: event = "Unknown" + # memtile traces + elif trace_type == 3: + if code == 0x1: + event = "True" + elif code == 80: # 0x50 + event = "PortRunning0" + elif code == 84: # 0x54 + event = "PortRunning1" + elif code == 88: # 0x58 + event = "PortRunning2" + elif code == 92: # 0x5C + event = "PortRunning3" + elif code == 96: # 0x60 + event = "PortRunning4" + elif code == 100: # 0x64 + event = "PortRunning5" + elif code == 104: # 0x68 + event = "PortRunning6" + elif code == 108: # 0x6C + event = "PortRunning7" + else: + event = "Unknown" else: event = "Unknown" return event diff --git a/test/create-packet-flows/trace_packet_routing.mlir b/test/create-packet-flows/trace_packet_routing.mlir new file mode 100644 index 0000000000..4353244417 --- /dev/null +++ b/test/create-packet-flows/trace_packet_routing.mlir @@ -0,0 +1,28 @@ +//===- trace_packet_routing.mlir ------------------------------------------------*- MLIR -*-===// +// +// Copyright (C) 2024, Advanced Micro Devices, Inc. +// SPDX-License-Identifier: MIT +// +//===----------------------------------------------------------------------===// +// REQUIRES: ryzen_ai, chess + +// RUN: aie-opt --aie-create-packet-flows %s | FileCheck %s +// CHECK-LABEL: module @trace_packet_routing { + +module @trace_packet_routing { + aie.device(npu1_4col) { + %tile_0_0 = aie.tile(0, 0) + %tile_1_0 = aie.tile(1, 0) + %tile_0_2 = aie.tile(0, 2) + %tile_0_3 = aie.tile(0, 3) + + aie.packet_flow(0) { + aie.packet_source<%tile_0_2, Trace : 0> // core trace + aie.packet_dest<%tile_0_0, DMA : 1> + } {keep_pkt_header = true} + aie.packet_flow(1) { + aie.packet_source<%tile_0_3, Trace : 0> // core trace + aie.packet_dest<%tile_1_0, DMA : 1> + } {keep_pkt_header = true} + } +} diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/README.md b/test/npu-xrt/matrix_multiplication_using_cascade/README.md new file mode 100644 index 0000000000..130362794f --- /dev/null +++ b/test/npu-xrt/matrix_multiplication_using_cascade/README.md @@ -0,0 +1,38 @@ + + +## MM Cascade Design Example +This is a matrix multiply example with the sizes of (16 * 16) * (16 * 16) and i32 data type, where four different versions are compared to examine the possibility of distributing K dim accross multiple cores. + +### Plainx1 Version
+Generated from the IREE end-to-end flow, this version runs on a single core and serves as the baseline.
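+
+A minimal host-side sketch of the same (16 x 16) x (16 x 16) i32 computation is shown below, purely to pin down the data layout and the reference result. The real verification lives in `test.cpp` and `matrix_multiplication.h`; the names here (`ref_matmul`, `M`/`K`/`N`) are illustrative and not taken from that harness.
+
+```cpp
+// Standalone reference for the 16x16x16 i32 matmul computed by all four designs.
+// Build with: g++ -std=c++17 -o ref ref.cc
+#include <cstdint>
+#include <iostream>
+#include <vector>
+
+constexpr int M = 16, K = 16, N = 16;
+
+void ref_matmul(const std::vector<int32_t> &A, const std::vector<int32_t> &B,
+                std::vector<int32_t> &C) {
+  for (int m = 0; m < M; ++m)
+    for (int n = 0; n < N; ++n) {
+      int32_t acc = 0;
+      for (int k = 0; k < K; ++k)
+        acc += A[m * K + k] * B[k * N + n]; // row-major A, B, C
+      C[m * N + n] = acc;
+    }
+}
+
+int main() {
+  std::vector<int32_t> A(M * K), B(K * N), C(M * N);
+  for (int i = 0; i < M * K; ++i) A[i] = i % 7; // arbitrary test pattern
+  for (int i = 0; i < K * N; ++i) B[i] = i % 5;
+  ref_matmul(A, B, C);
+  std::cout << "C[0][0] = " << C[0] << ", C[15][15] = " << C[M * N - 1] << "\n";
+  return 0;
+}
+```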
+
+### Plainx4 Version
+Uses four cores in an output-stationary arrangement: each core computes its own portion of the output and keeps the accumulation local.
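+
+The sketch below models the output-stationary split on the host: each of the four "cores" owns a quarter of the output rows and performs the full K reduction locally. The row-wise split is only an illustrative assumption; the actual tiling is fixed by `aie_plainx4.mlir`.
+
+```cpp
+// Output-stationary partitioning: 4 workers, each owning 4 of the 16 output rows.
+#include <cstdint>
+#include <vector>
+
+constexpr int M = 16, K = 16, N = 16, CORES = 4, ROWS_PER_CORE = M / CORES;
+
+// One "core": the accumulator for its output tile never leaves this function.
+void core_compute(int core, const std::vector<int32_t> &A,
+                  const std::vector<int32_t> &B, std::vector<int32_t> &C) {
+  for (int m = core * ROWS_PER_CORE; m < (core + 1) * ROWS_PER_CORE; ++m)
+    for (int n = 0; n < N; ++n) {
+      int32_t acc = 0;
+      for (int k = 0; k < K; ++k) // the full K reduction stays local
+        acc += A[m * K + k] * B[k * N + n];
+      C[m * N + n] = acc;
+    }
+}
+
+int main() {
+  std::vector<int32_t> A(M * K, 1), B(K * N, 2), C(M * N, 0);
+  for (int core = 0; core < CORES; ++core) // sequential here; concurrent on the NPU
+    core_compute(core, A, B, C);
+  return 0;
+}
+```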
+
+### Bufferx4 Version
+Four cores are chained horizontally; the K dimension is split across them, and the intermediate accumulations are passed from core to core through shared buffers implemented as ObjectFIFOs.
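+
+A host-side model of this dataflow: K is split into four quarters, and each stage adds its contribution on top of the partial sums received from the previous stage before handing them on. The hand-off below is a plain vector standing in for the ObjectFIFOs `@of0`-`@of2`, which in the design carry one i32 element at a time; `buffer_stage` is an illustrative name.
+
+```cpp
+// Chain of four stages over K; stage i consumes the partial sums of stage i-1.
+#include <cstdint>
+#include <vector>
+
+constexpr int M = 16, K = 16, N = 16, CORES = 4, K_PER_CORE = K / CORES;
+
+// partial holds the running C; stage 0 receives zeros, stage 3 produces the result.
+std::vector<int32_t> buffer_stage(int core, const std::vector<int32_t> &A,
+                                  const std::vector<int32_t> &B,
+                                  std::vector<int32_t> partial) {
+  for (int m = 0; m < M; ++m)
+    for (int n = 0; n < N; ++n)
+      for (int k = core * K_PER_CORE; k < (core + 1) * K_PER_CORE; ++k)
+        partial[m * N + n] += A[m * K + k] * B[k * N + n];
+  return partial; // forwarded to the next core through the shared buffer
+}
+
+int main() {
+  std::vector<int32_t> A(M * K, 1), B(K * N, 2), C(M * N, 0);
+  for (int core = 0; core < CORES; ++core)
+    C = buffer_stage(core, A, B, std::move(C)); // the last stage holds the result
+  return 0;
+}
+```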
+
+### Cascadex4 Version
+Also uses four cores with the same K split, but the intermediate accumulations are forwarded over the cascade port (`aie.cascade_flow`) by the put/get matmul kernels linked from `mm.cc` (`matmul_scalar_put_4x1x4_4x4x4_i32_i32` and friends), as sketched below.
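+
+The put/get kernels implement the same chain as the buffer version, except that the hand-off travels over the cascade port between neighbouring cores instead of a memory buffer. The sketch below models that pattern on the host; `cascade_put`/`cascade_get` are hypothetical stand-ins for whatever cascade primitives `mm.cc` actually uses (its source is not reproduced here), backed by a plain FIFO so the example runs.
+
+```cpp
+// Cascade-style chain: partial sums travel element by element over the cascade.
+#include <cstdint>
+#include <queue>
+#include <vector>
+
+constexpr int M = 16, K = 16, N = 16, CORES = 4, K_PER_CORE = K / CORES;
+
+static std::queue<int32_t> cascade;           // stand-in for the cascade link
+static void cascade_put(int32_t v) { cascade.push(v); }
+static int32_t cascade_get() { int32_t v = cascade.front(); cascade.pop(); return v; }
+
+// The first core only puts, the middle cores put and get, the last core only gets.
+void cascade_stage(int core, const std::vector<int32_t> &A,
+                   const std::vector<int32_t> &B, std::vector<int32_t> &C) {
+  bool first = (core == 0), last = (core == CORES - 1);
+  for (int m = 0; m < M; ++m)
+    for (int n = 0; n < N; ++n) {
+      int32_t acc = first ? 0 : cascade_get(); // partial sum from the neighbour
+      for (int k = core * K_PER_CORE; k < (core + 1) * K_PER_CORE; ++k)
+        acc += A[m * K + k] * B[k * N + n];
+      if (last) C[m * N + n] = acc; else cascade_put(acc);
+    }
+}
+
+int main() {
+  std::vector<int32_t> A(M * K, 1), B(K * N, 2), C(M * N, 0);
+  for (int core = 0; core < CORES; ++core)    // on hardware these run concurrently
+    cascade_stage(core, A, B, C);
+  return 0;
+}
+```
+
+### Results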
+From the trace files, + +| | Total | Init | Compute | +|-----------|--------|-------|---------| +| Plainx1 | 25.6us | 7.6us | 18.0us | +| Plainx4 | 6.7us | 2.0us | 4.7us | +| Bufferx4 | 32.0us | 7.6us | 24.4us | +| Cascadex4 | 13.9us | 7.6us | 6.3us | + +The Buffer version is slow because of frequent lock-related operations. + +The Cascade version almost halves the latency but with 4x cores. The performance gain is constrained by the initialization time of the accumulation buffer (depends on MxN only). diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/aie_bufferx4.mlir b/test/npu-xrt/matrix_multiplication_using_cascade/aie_bufferx4.mlir new file mode 100644 index 0000000000..3112c0c05e --- /dev/null +++ b/test/npu-xrt/matrix_multiplication_using_cascade/aie_bufferx4.mlir @@ -0,0 +1,648 @@ +//===- aie.mlir ------------------------------------------------*- MLIR -*-===// +// +// Copyright (C) 2024, Advanced Micro Devices, Inc. +// SPDX-License-Identifier: MIT +// +//===----------------------------------------------------------------------===// + +module { + aie.device(npu1_4col) { + // + func.func private @flush_trace() + func.func private @event_0() + func.func private @event_1() + // + %tile_0_0 = aie.tile(0, 0) + %tile_0_1 = aie.tile(0, 1) + %tile_1_1 = aie.tile(1, 1) + %tile_0_2 = aie.tile(0, 2) + %tile_1_2 = aie.tile(1, 2) + %tile_2_2 = aie.tile(2, 2) + %tile_3_2 = aie.tile(3, 2) + // + %tile_1_0 = aie.tile(1, 0) + %tile_2_0 = aie.tile(2, 0) + %tile_3_0 = aie.tile(3, 0) + // + %lock_1_1 = aie.lock(%tile_1_1, 1) {init = 4 : i32} + %lock_1_1_0 = aie.lock(%tile_1_1, 0) {init = 0 : i32} + %lock_0_1 = aie.lock(%tile_0_1, 3) {init = 4 : i32} + %lock_0_1_1 = aie.lock(%tile_0_1, 2) {init = 0 : i32} + %lock_0_1_2 = aie.lock(%tile_0_1, 1) {init = 1 : i32} + %lock_0_1_3 = aie.lock(%tile_0_1, 0) {init = 0 : i32} + %lock_0_2 = aie.lock(%tile_0_2, 3) {init = 1 : i32} + %lock_0_2_4 = aie.lock(%tile_0_2, 2) {init = 0 : i32} + %lock_0_2_5 = aie.lock(%tile_0_2, 1) {init = 1 : i32} + %lock_0_2_6 = aie.lock(%tile_0_2, 0) {init = 0 : i32} + %lock_1_2 = aie.lock(%tile_1_2, 3) {init = 1 : i32} + %lock_1_2_7 = aie.lock(%tile_1_2, 2) {init = 0 : i32} + %lock_1_2_8 = aie.lock(%tile_1_2, 1) {init = 1 : i32} + %lock_1_2_9 = aie.lock(%tile_1_2, 0) {init = 0 : i32} + %lock_2_2 = aie.lock(%tile_2_2, 3) {init = 1 : i32} + %lock_2_2_10 = aie.lock(%tile_2_2, 2) {init = 0 : i32} + %lock_2_2_11 = aie.lock(%tile_2_2, 1) {init = 1 : i32} + %lock_2_2_12 = aie.lock(%tile_2_2, 0) {init = 0 : i32} + %lock_3_2 = aie.lock(%tile_3_2, 5) {init = 1 : i32} + %lock_3_2_13 = aie.lock(%tile_3_2, 4) {init = 0 : i32} + %lock_3_2_14 = aie.lock(%tile_3_2, 3) {init = 1 : i32} + %lock_3_2_15 = aie.lock(%tile_3_2, 2) {init = 0 : i32} + %lock_3_2_16 = aie.lock(%tile_3_2, 1) {init = 1 : i32} + %lock_3_2_17 = aie.lock(%tile_3_2, 0) {init = 0 : i32} + %buf14 = aie.buffer(%tile_0_1) {mem_bank = 0 : i32, sym_name = "buf14"} : memref<16x16xi32, 1 : i32> + %buf13 = aie.buffer(%tile_1_1) {mem_bank = 0 : i32, sym_name = "buf13"} : memref<16x16xi32, 1 : i32> + %buf12 = aie.buffer(%tile_0_1) {mem_bank = 0 : i32, sym_name = "buf12"} : memref<16x16xi32, 1 : i32> + %buf11 = aie.buffer(%tile_3_2) {mem_bank = 0 : i32, sym_name = "buf11"} : memref<1x4x4x4xi32, 2 : i32> + %buf10 = aie.buffer(%tile_3_2) {mem_bank = 0 : i32, sym_name = "buf10"} : memref<4x1x4x4xi32, 2 : i32> + %buf9 = aie.buffer(%tile_3_2) {mem_bank = 0 : i32, sym_name = "buf9"} : memref<4x4x4x4xi32, 2 : i32> + %buf8 = aie.buffer(%tile_2_2) {mem_bank = 0 : i32, sym_name = "buf8"} : 
memref<1x4x4x4xi32, 2 : i32> + %buf7 = aie.buffer(%tile_2_2) {mem_bank = 0 : i32, sym_name = "buf7"} : memref<4x1x4x4xi32, 2 : i32> + %buf6 = aie.buffer(%tile_2_2) {mem_bank = 0 : i32, sym_name = "buf6"} : memref<4x4x4x4xi32, 2 : i32> + %buf5 = aie.buffer(%tile_1_2) {mem_bank = 0 : i32, sym_name = "buf5"} : memref<1x4x4x4xi32, 2 : i32> + %buf4 = aie.buffer(%tile_1_2) {mem_bank = 0 : i32, sym_name = "buf4"} : memref<4x1x4x4xi32, 2 : i32> + %buf3 = aie.buffer(%tile_1_2) {mem_bank = 0 : i32, sym_name = "buf3"} : memref<4x4x4x4xi32, 2 : i32> + %buf2 = aie.buffer(%tile_0_2) {mem_bank = 0 : i32, sym_name = "buf2"} : memref<1x4x4x4xi32, 2 : i32> + %buf1 = aie.buffer(%tile_0_2) {mem_bank = 0 : i32, sym_name = "buf1"} : memref<4x1x4x4xi32, 2 : i32> + %buf0 = aie.buffer(%tile_0_2) {mem_bank = 0 : i32, sym_name = "buf0"} : memref<4x4x4x4xi32, 2 : i32> + aie.objectfifo @of0 (%tile_0_2, {%tile_1_2}, 1 : i32) : !aie.objectfifo> + aie.objectfifo @of1 (%tile_1_2, {%tile_2_2}, 1 : i32) : !aie.objectfifo> + aie.objectfifo @of2 (%tile_2_2, {%tile_3_2}, 1 : i32) : !aie.objectfifo> + %mem_0_2 = aie.mem(%tile_0_2) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_2_5, AcquireGreaterEqual, 1) + aie.dma_bd(%buf2 : memref<1x4x4x4xi32, 2 : i32>, 0, 64) + aie.use_lock(%lock_0_2_6, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(S2MM, 1, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_2, AcquireGreaterEqual, 1) + aie.dma_bd(%buf1 : memref<4x1x4x4xi32, 2 : i32>, 0, 64) + aie.use_lock(%lock_0_2_4, Release, 1) + aie.next_bd ^bb4 + } + %core_0_2 = aie.core(%tile_0_2) { + %c0_i32 = arith.constant 0 : i32 + %c4 = arith.constant 4 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_2_6, AcquireGreaterEqual, 1) + aie.use_lock(%lock_0_2_4, AcquireGreaterEqual, 1) + // + func.call @event_0() : () -> () + // + scf.for %arg0 = %c0 to %c4 step %c1 { + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c4 step %c1 { + memref.store %c0_i32, %buf0[%arg0, %arg1, %arg2, %arg3] : memref<4x4x4x4xi32, 2 : i32> + } + } + } + } + // + func.call @event_1() : () -> () + // + scf.for %arg0 = %c0 to %c4 step %c1 { + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c1 step %c1 { + %subview = memref.subview %buf2[%arg2, %arg0, 0, 0] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x4x4x4xi32, 2 : i32> to memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + %subview_8 = memref.subview %buf1[%arg1, %arg2, 0, 0] [1, 1, 4, 4] [1, 1, 1, 1] : memref<4x1x4x4xi32, 2 : i32> to memref<1x1x4x4xi32, strided<[16, 16, 4, 1], offset: ?>, 2 : i32> + %subview_9 = memref.subview %buf0[%arg1, %arg0, 0, 0] [1, 1, 4, 4] [1, 1, 1, 1] : memref<4x4x4x4xi32, 2 : i32> to memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + // + func.call @event_0() : () -> () + // + scf.for %arg3 = %c0 to %c4 step %c1 { + scf.for %arg4 = %c0 to %c4 step %c1 { + scf.for %arg5 = %c0 to %c4 step %c1 { + %0 = memref.load %subview[%c0, %c0, %arg3, %arg5] : memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + %1 = memref.load %subview_8[%c0, %c0, %arg5, %arg4] : memref<1x1x4x4xi32, strided<[16, 16, 4, 1], offset: ?>, 2 : i32> + %2 = memref.load %subview_9[%c0, %c0, %arg3, %arg4] : 
memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + %3 = arith.muli %0, %1 : i32 + %4 = arith.addi %2, %3 : i32 + memref.store %4, %subview_9[%c0, %c0, %arg3, %arg4] : memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + } + %cond0 = arith.cmpi "eq", %arg2, %c0 : index + scf.if %cond0 { + %5 = aie.objectfifo.acquire @of0 (Produce, 1) : !aie.objectfifosubview> + %6 = aie.objectfifo.subview.access %5[0] : !aie.objectfifosubview> -> memref<1xi32> + %7 = memref.load %subview_9[%c0, %c0, %arg3, %arg4] : memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + memref.store %7, %6[%c0] : memref<1xi32> + aie.objectfifo.release @of0 (Produce, 1) + } + } + } + // + func.call @event_1() : () -> () + // + } + } + } + aie.use_lock(%lock_0_2_5, Release, 1) + aie.use_lock(%lock_0_2, Release, 1) + // + func.call @flush_trace() : () -> () + // + cf.br ^bb1 + } {elf_file = "segment_0_core_0_2.elf",link_with = "mm.o"} + %mem_1_2 = aie.mem(%tile_1_2) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_1_2_8, AcquireGreaterEqual, 1) + aie.dma_bd(%buf5 : memref<1x4x4x4xi32, 2 : i32>, 0, 64) + aie.use_lock(%lock_1_2_9, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(S2MM, 1, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_1_2, AcquireGreaterEqual, 1) + aie.dma_bd(%buf4 : memref<4x1x4x4xi32, 2 : i32>, 0, 64) + aie.use_lock(%lock_1_2_7, Release, 1) + aie.next_bd ^bb4 + } + %core_1_2 = aie.core(%tile_1_2) { + %c0_i32 = arith.constant 0 : i32 + %c4 = arith.constant 4 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_1_2_9, AcquireGreaterEqual, 1) + aie.use_lock(%lock_1_2_7, AcquireGreaterEqual, 1) + // + func.call @event_0() : () -> () + // + scf.for %arg0 = %c0 to %c4 step %c1 { + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c4 step %c1 { + memref.store %c0_i32, %buf3[%arg0, %arg1, %arg2, %arg3] : memref<4x4x4x4xi32, 2 : i32> + } + } + } + } + // + func.call @event_1() : () -> () + // + scf.for %arg0 = %c0 to %c4 step %c1 { + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c1 step %c1 { + %subview = memref.subview %buf5[%arg2, %arg0, 0, 0] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x4x4x4xi32, 2 : i32> to memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + %subview_8 = memref.subview %buf4[%arg1, %arg2, 0, 0] [1, 1, 4, 4] [1, 1, 1, 1] : memref<4x1x4x4xi32, 2 : i32> to memref<1x1x4x4xi32, strided<[16, 16, 4, 1], offset: ?>, 2 : i32> + %subview_9 = memref.subview %buf3[%arg1, %arg0, 0, 0] [1, 1, 4, 4] [1, 1, 1, 1] : memref<4x4x4x4xi32, 2 : i32> to memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + // + func.call @event_0() : () -> () + // + scf.for %arg3 = %c0 to %c4 step %c1 { + scf.for %arg4 = %c0 to %c4 step %c1 { + %cond0 = arith.cmpi "eq", %arg2, %c0 : index + scf.if %cond0 { + %5 = aie.objectfifo.acquire @of0 (Consume, 1) : !aie.objectfifosubview> + %6 = aie.objectfifo.subview.access %5[0] : !aie.objectfifosubview> -> memref<1xi32> + %7 = memref.load %6[%c0] : memref<1xi32> + memref.store %7, %subview_9[%c0, %c0, %arg3, %arg4] : memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + aie.objectfifo.release @of0 (Consume, 1) + } + scf.for %arg5 = %c0 to %c4 step 
%c1 { + %0 = memref.load %subview[%c0, %c0, %arg3, %arg5] : memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + %1 = memref.load %subview_8[%c0, %c0, %arg5, %arg4] : memref<1x1x4x4xi32, strided<[16, 16, 4, 1], offset: ?>, 2 : i32> + %2 = memref.load %subview_9[%c0, %c0, %arg3, %arg4] : memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + %3 = arith.muli %0, %1 : i32 + %4 = arith.addi %2, %3 : i32 + memref.store %4, %subview_9[%c0, %c0, %arg3, %arg4] : memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + } + %cond1 = arith.cmpi "eq", %arg2, %c0 : index + scf.if %cond1 { + %8 = aie.objectfifo.acquire @of1 (Produce, 1) : !aie.objectfifosubview> + %9 = aie.objectfifo.subview.access %8[0] : !aie.objectfifosubview> -> memref<1xi32> + %10 = memref.load %subview_9[%c0, %c0, %arg3, %arg4] : memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + memref.store %10, %9[%c0] : memref<1xi32> + aie.objectfifo.release @of1 (Produce, 1) + } + } + } + // + func.call @event_1() : () -> () + // + } + } + } + aie.use_lock(%lock_1_2_8, Release, 1) + aie.use_lock(%lock_1_2, Release, 1) + // + func.call @flush_trace() : () -> () + // + cf.br ^bb1 + } {elf_file = "segment_0_core_1_2.elf",link_with = "mm.o"} + %mem_2_2 = aie.mem(%tile_2_2) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_2_2_11, AcquireGreaterEqual, 1) + aie.dma_bd(%buf8 : memref<1x4x4x4xi32, 2 : i32>, 0, 64) + aie.use_lock(%lock_2_2_12, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(S2MM, 1, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_2_2, AcquireGreaterEqual, 1) + aie.dma_bd(%buf7 : memref<4x1x4x4xi32, 2 : i32>, 0, 64) + aie.use_lock(%lock_2_2_10, Release, 1) + aie.next_bd ^bb4 + } + %core_2_2 = aie.core(%tile_2_2) { + %c0_i32 = arith.constant 0 : i32 + %c4 = arith.constant 4 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_2_2_12, AcquireGreaterEqual, 1) + aie.use_lock(%lock_2_2_10, AcquireGreaterEqual, 1) + // + func.call @event_0() : () -> () + // + scf.for %arg0 = %c0 to %c4 step %c1 { + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c4 step %c1 { + memref.store %c0_i32, %buf6[%arg0, %arg1, %arg2, %arg3] : memref<4x4x4x4xi32, 2 : i32> + } + } + } + } + // + func.call @event_1() : () -> () + // + scf.for %arg0 = %c0 to %c4 step %c1 { + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c1 step %c1 { + %subview = memref.subview %buf8[%arg2, %arg0, 0, 0] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x4x4x4xi32, 2 : i32> to memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + %subview_8 = memref.subview %buf7[%arg1, %arg2, 0, 0] [1, 1, 4, 4] [1, 1, 1, 1] : memref<4x1x4x4xi32, 2 : i32> to memref<1x1x4x4xi32, strided<[16, 16, 4, 1], offset: ?>, 2 : i32> + %subview_9 = memref.subview %buf6[%arg1, %arg0, 0, 0] [1, 1, 4, 4] [1, 1, 1, 1] : memref<4x4x4x4xi32, 2 : i32> to memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + // + func.call @event_0() : () -> () + // + scf.for %arg3 = %c0 to %c4 step %c1 { + scf.for %arg4 = %c0 to %c4 step %c1 { + %cond0 = arith.cmpi "eq", %arg2, %c0 : index + scf.if %cond0 { + %5 = aie.objectfifo.acquire @of1 (Consume, 1) : !aie.objectfifosubview> + %6 = 
aie.objectfifo.subview.access %5[0] : !aie.objectfifosubview> -> memref<1xi32> + %7 = memref.load %6[%c0] : memref<1xi32> + memref.store %7, %subview_9[%c0, %c0, %arg3, %arg4] : memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + aie.objectfifo.release @of1 (Consume, 1) + } + scf.for %arg5 = %c0 to %c4 step %c1 { + %0 = memref.load %subview[%c0, %c0, %arg3, %arg5] : memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + %1 = memref.load %subview_8[%c0, %c0, %arg5, %arg4] : memref<1x1x4x4xi32, strided<[16, 16, 4, 1], offset: ?>, 2 : i32> + %2 = memref.load %subview_9[%c0, %c0, %arg3, %arg4] : memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + %3 = arith.muli %0, %1 : i32 + %4 = arith.addi %2, %3 : i32 + memref.store %4, %subview_9[%c0, %c0, %arg3, %arg4] : memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + } + %cond1 = arith.cmpi "eq", %arg2, %c0 : index + scf.if %cond1 { + %8 = aie.objectfifo.acquire @of2 (Produce, 1) : !aie.objectfifosubview> + %9 = aie.objectfifo.subview.access %8[0] : !aie.objectfifosubview> -> memref<1xi32> + %10 = memref.load %subview_9[%c0, %c0, %arg3, %arg4] : memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + memref.store %10, %9[%c0] : memref<1xi32> + aie.objectfifo.release @of2 (Produce, 1) + } + } + } + // + func.call @event_1() : () -> () + // + } + } + } + aie.use_lock(%lock_2_2_11, Release, 1) + aie.use_lock(%lock_2_2, Release, 1) + // + func.call @flush_trace() : () -> () + // + cf.br ^bb1 + } {elf_file = "segment_0_core_2_2.elf",link_with = "mm.o"} + %mem_3_2 = aie.mem(%tile_3_2) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb5, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_3_2_14, AcquireGreaterEqual, 1) + aie.dma_bd(%buf11 : memref<1x4x4x4xi32, 2 : i32>, 0, 64) + aie.use_lock(%lock_3_2_15, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(S2MM, 1, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_3_2, AcquireGreaterEqual, 1) + aie.dma_bd(%buf10 : memref<4x1x4x4xi32, 2 : i32>, 0, 64) + aie.use_lock(%lock_3_2_13, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb0 + %2 = aie.dma_start(MM2S, 0, ^bb6, ^bb3, repeat_count = 1) + ^bb6: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_3_2_17, AcquireGreaterEqual, 1) + aie.dma_bd(%buf9 : memref<4x4x4x4xi32, 2 : i32>, 0, 256, [, , ]) + aie.use_lock(%lock_3_2_16, Release, 1) + aie.next_bd ^bb6 + } + %core_3_2 = aie.core(%tile_3_2) { + %c0_i32 = arith.constant 0 : i32 + %c4 = arith.constant 4 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_3_2_16, AcquireGreaterEqual, 1) + aie.use_lock(%lock_3_2_15, AcquireGreaterEqual, 1) + aie.use_lock(%lock_3_2_13, AcquireGreaterEqual, 1) + // + func.call @event_0() : () -> () + // + scf.for %arg0 = %c0 to %c4 step %c1 { + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c4 step %c1 { + memref.store %c0_i32, %buf9[%arg0, %arg1, %arg2, %arg3] : memref<4x4x4x4xi32, 2 : i32> + } + } + } + } + // + func.call @event_1() : () -> () + // + scf.for %arg0 = %c0 to %c4 step %c1 { + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c1 step %c1 { + %subview = memref.subview %buf11[%arg2, %arg0, 0, 0] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x4x4x4xi32, 2 : i32> to memref<1x1x4x4xi32, strided<[64, 16, 
4, 1], offset: ?>, 2 : i32> + %subview_8 = memref.subview %buf10[%arg1, %arg2, 0, 0] [1, 1, 4, 4] [1, 1, 1, 1] : memref<4x1x4x4xi32, 2 : i32> to memref<1x1x4x4xi32, strided<[16, 16, 4, 1], offset: ?>, 2 : i32> + %subview_9 = memref.subview %buf9[%arg1, %arg0, 0, 0] [1, 1, 4, 4] [1, 1, 1, 1] : memref<4x4x4x4xi32, 2 : i32> to memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + // + func.call @event_0() : () -> () + // + scf.for %arg3 = %c0 to %c4 step %c1 { + scf.for %arg4 = %c0 to %c4 step %c1 { + %cond0 = arith.cmpi "eq", %arg2, %c0 : index + scf.if %cond0 { + %5 = aie.objectfifo.acquire @of2 (Consume, 1) : !aie.objectfifosubview> + %6 = aie.objectfifo.subview.access %5[0] : !aie.objectfifosubview> -> memref<1xi32> + %7 = memref.load %6[%c0] : memref<1xi32> + memref.store %7, %subview_9[%c0, %c0, %arg3, %arg4] : memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + aie.objectfifo.release @of2 (Consume, 1) + } + scf.for %arg5 = %c0 to %c4 step %c1 { + %0 = memref.load %subview[%c0, %c0, %arg3, %arg5] : memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + %1 = memref.load %subview_8[%c0, %c0, %arg5, %arg4] : memref<1x1x4x4xi32, strided<[16, 16, 4, 1], offset: ?>, 2 : i32> + %2 = memref.load %subview_9[%c0, %c0, %arg3, %arg4] : memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + %3 = arith.muli %0, %1 : i32 + %4 = arith.addi %2, %3 : i32 + memref.store %4, %subview_9[%c0, %c0, %arg3, %arg4] : memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + } + } + } + // + func.call @event_1() : () -> () + // + } + } + } + aie.use_lock(%lock_3_2_17, Release, 1) + aie.use_lock(%lock_3_2_14, Release, 1) + aie.use_lock(%lock_3_2, Release, 1) + // + func.call @flush_trace() : () -> () + // + cf.br ^bb1 + } {elf_file = "segment_0_core_3_2.elf", link_with = "mm.o"} + aie.flow(%tile_0_0, DMA : 0, %tile_0_1, DMA : 0) + aie.flow(%tile_0_0, DMA : 1, %tile_1_1, DMA : 0) + aie.flow(%tile_0_1, DMA : 0, %tile_0_2, DMA : 0) + aie.flow(%tile_0_1, DMA : 1, %tile_1_2, DMA : 0) + aie.flow(%tile_0_1, DMA : 2, %tile_2_2, DMA : 0) + aie.flow(%tile_0_1, DMA : 3, %tile_3_2, DMA : 0) + aie.flow(%tile_1_1, DMA : 0, %tile_0_2, DMA : 1) + aie.flow(%tile_1_1, DMA : 1, %tile_1_2, DMA : 1) + aie.flow(%tile_1_1, DMA : 2, %tile_2_2, DMA : 1) + aie.flow(%tile_1_1, DMA : 3, %tile_3_2, DMA : 1) + aie.flow(%tile_3_2, DMA : 0, %tile_0_1, DMA : 1) + aie.flow(%tile_0_1, DMA : 4, %tile_0_0, DMA : 0) + aie.cascade_flow(%tile_0_2, %tile_1_2) + aie.cascade_flow(%tile_1_2, %tile_2_2) + aie.cascade_flow(%tile_2_2, %tile_3_2) + // + aie.packet_flow(0) { + aie.packet_source<%tile_0_2, Trace : 0> + aie.packet_dest<%tile_0_0, DMA : 1> + } {keep_pkt_header = true} + aie.packet_flow(1) { + aie.packet_source<%tile_1_2, Trace : 0> + aie.packet_dest<%tile_1_0, DMA : 1> + } {keep_pkt_header = true} + aie.packet_flow(2) { + aie.packet_source<%tile_2_2, Trace : 0> + aie.packet_dest<%tile_2_0, DMA : 1> + } {keep_pkt_header = true} + aie.packet_flow(3) { + aie.packet_source<%tile_3_2, Trace : 0> + aie.packet_dest<%tile_3_0, DMA : 1> + } {keep_pkt_header = true} + aie.packet_flow(4) { + aie.packet_source<%tile_0_1, Trace : 0> + aie.packet_dest<%tile_0_0, DMA : 1> + } {keep_pkt_header = true} + aie.packet_flow(5) { + aie.packet_source<%tile_1_1, Trace : 0> + aie.packet_dest<%tile_1_0, DMA : 1> + } {keep_pkt_header = true} + // + %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb13, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_1, 
AcquireGreaterEqual, 4) + aie.dma_bd(%buf14 : memref<16x16xi32, 1 : i32>, 0, 256) + aie.use_lock(%lock_0_1_1, Release, 4) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb5 + %1 = aie.dma_start(S2MM, 1, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_1_2, AcquireGreaterEqual, 1) + aie.dma_bd(%buf12 : memref<16x16xi32, 1 : i32>, 0, 256) + aie.use_lock(%lock_0_1_3, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb5 + %2 = aie.dma_start(MM2S, 0, ^bb6, ^bb3, repeat_count = 1) + ^bb6: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf14 : memref<16x16xi32, 1 : i32>, 0, 64, [, , ]) + aie.use_lock(%lock_0_1, Release, 1) + aie.next_bd ^bb6 + ^bb7: // pred: ^bb7 + %3 = aie.dma_start(MM2S, 1, ^bb8, ^bb5, repeat_count = 1) + ^bb8: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf14 : memref<16x16xi32, 1 : i32>, 4, 64, [, , ]) + aie.use_lock(%lock_0_1, Release, 1) + aie.next_bd ^bb8 + ^bb9: // pred: ^bb9 + %4 = aie.dma_start(MM2S, 2, ^bb10, ^bb7, repeat_count = 1) + ^bb10: // 2 preds: ^bb7, ^bb8 + aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf14 : memref<16x16xi32, 1 : i32>, 8, 64, [, , ]) + aie.use_lock(%lock_0_1, Release, 1) + aie.next_bd ^bb10 + ^bb11: // pred: ^bb0 + %5 = aie.dma_start(MM2S, 3, ^bb12, ^bb9, repeat_count = 1) + ^bb12: // 2 preds: ^bb9, ^bb10 + aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf14 : memref<16x16xi32, 1 : i32>, 12, 64, [, , ]) + aie.use_lock(%lock_0_1, Release, 1) + aie.next_bd ^bb12 + ^bb13: // pred: ^bb0 + %6 = aie.dma_start(MM2S, 4, ^bb14, ^bb11, repeat_count = 1) + ^bb14: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_1_3, AcquireGreaterEqual, 1) + aie.dma_bd(%buf12 : memref<16x16xi32, 1 : i32>, 0, 256) + aie.use_lock(%lock_0_1_2, Release, 1) + aie.next_bd ^bb14 + } + %memtile_dma_1_1 = aie.memtile_dma(%tile_1_1) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb9, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_1_1, AcquireGreaterEqual, 4) + aie.dma_bd(%buf13 : memref<16x16xi32, 1 : i32>, 0, 256) + aie.use_lock(%lock_1_1_0, Release, 4) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb5 + %1 = aie.dma_start(MM2S, 0, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1) + aie.dma_bd(%buf13 : memref<16x16xi32, 1 : i32>, 0, 64, [, , ]) + aie.use_lock(%lock_1_1, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb7 + %2 = aie.dma_start(MM2S, 1, ^bb6, ^bb3, repeat_count = 1) + ^bb6: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1) + aie.dma_bd(%buf13 : memref<16x16xi32, 1 : i32>, 64, 64, [, , ]) + aie.use_lock(%lock_1_1, Release, 1) + aie.next_bd ^bb6 + ^bb7: // pred: ^bb9 + %3 = aie.dma_start(MM2S, 2, ^bb8, ^bb5, repeat_count = 1) + ^bb8: // 2 preds: ^bb7, ^bb8 + aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1) + aie.dma_bd(%buf13 : memref<16x16xi32, 1 : i32>, 128, 64, [, , ]) + aie.use_lock(%lock_1_1, Release, 1) + aie.next_bd ^bb8 + ^bb9: // pred: ^bb0 + %4 = aie.dma_start(MM2S, 3, ^bb10, ^bb7, repeat_count = 1) + ^bb10: // 2 preds: ^bb9, ^bb10 + aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1) + aie.dma_bd(%buf13 : memref<16x16xi32, 1 : i32>, 192, 64, [, , ]) + aie.use_lock(%lock_1_1, Release, 1) + aie.next_bd ^bb10 + } + aie.shim_dma_allocation @airMemcpyId12(S2MM, 0, 0) + memref.global "public" @airMemcpyId12 : memref<16x16xi32, 1 : i32> + aie.shim_dma_allocation 
@airMemcpyId4(MM2S, 0, 0) + memref.global "public" @airMemcpyId4 : memref<16x16xi32, 1 : i32> + aie.shim_dma_allocation @airMemcpyId5(MM2S, 1, 0) + memref.global "public" @airMemcpyId5 : memref<16x16xi32, 1 : i32> + func.func @matmul_16x16_16xi32__dispatch_0_matmul_16x16x16_i32(%arg0: memref<16x16xi32>, %arg1: memref<16x16xi32>, %arg2: memref<16x16xi32>) { + // + aiex.npu.write32 {address = 212992 : ui32, column = 3 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213200 : ui32, column = 3 : i32, row = 2 : i32, value = 7995392 : ui32} // [22:16] start event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213204 : ui32, column = 3 : i32, row = 2 : i32, value = 3 : ui32} // packet_type: 0(core), packet_id: 3 + aiex.npu.write32 {address = 213216 : ui32, column = 3 : i32, row = 2 : i32, value = 1260527873 : ui32} // events: 0x4B(port0 run) 22(event1) 21(event0) 01(true) + aiex.npu.write32 {address = 213220 : ui32, column = 3 : i32, row = 2 : i32, value = 6735 : ui32} // events: 0x00 00 1A(lock stall) 4F(port1 run) + aiex.npu.write32 {address = 261888 : ui32, column = 3 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 + aiex.npu.write32 {address = 261892 : ui32, column = 3 : i32, row = 2 : i32, value = 0 : ui32} + aiex.npu.writebd_shimtile {bd_id = 15 : i32, buffer_length = 8192 : i32, buffer_offset = 25600 : i32, column = 3 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 3: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.write32 {address = 119308 : ui32, column = 3 : i32, row = 0 : i32, value = 15 : ui32} + + aiex.npu.write32 {address = 212992 : ui32, column = 2 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213200 : ui32, column = 2 : i32, row = 2 : i32, value = 7995392 : ui32} // [22:16] start event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213204 : ui32, column = 2 : i32, row = 2 : i32, value = 2 : ui32} // packet_type: 0(core), packet_id: 2 + aiex.npu.write32 {address = 213216 : ui32, column = 2 : i32, row = 2 : i32, value = 1260527873 : ui32} // events: 0x4B(port0 run) 22(event1) 21(event0) 01(true) + aiex.npu.write32 {address = 213220 : ui32, column = 2 : i32, row = 2 : i32, value = 6735 : ui32} // events: 0x00 00 1A(lock stall) 4F(port1 run) + aiex.npu.write32 {address = 261888 : ui32, column = 2 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 + aiex.npu.write32 {address = 261892 : ui32, column = 2 : i32, row = 2 : i32, value = 0 : ui32} + aiex.npu.writebd_shimtile {bd_id = 14 : i32, buffer_length = 8192 : i32, buffer_offset = 17408 : i32, column = 2 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id 
= 0 : i32, packet_id = 2: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.write32 {address = 119308 : ui32, column = 2 : i32, row = 0 : i32, value = 14 : ui32} + + aiex.npu.write32 {address = 212992 : ui32, column = 1 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213200 : ui32, column = 1 : i32, row = 2 : i32, value = 7995392 : ui32} // [22:16] start event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213204 : ui32, column = 1 : i32, row = 2 : i32, value = 1 : ui32} // packet_type: 0(core), packet_id: 1 + aiex.npu.write32 {address = 213216 : ui32, column = 1 : i32, row = 2 : i32, value = 1260527873 : ui32} // events: 0x4B(port0 run) 22(event1) 21(event0) 01(true) + aiex.npu.write32 {address = 213220 : ui32, column = 1 : i32, row = 2 : i32, value = 6735 : ui32} // events: 0x00 00 1A(lock stall) 4F(port1 run) + aiex.npu.write32 {address = 261888 : ui32, column = 1 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 + aiex.npu.write32 {address = 261892 : ui32, column = 1 : i32, row = 2 : i32, value = 0 : ui32} + aiex.npu.writebd_shimtile {bd_id = 13 : i32, buffer_length = 8192 : i32, buffer_offset = 9216 : i32, column = 1 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 1: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.write32 {address = 119308 : ui32, column = 1 : i32, row = 0 : i32, value = 13 : ui32} + + aiex.npu.write32 {address = 212992 : ui32, column = 0 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213200 : ui32, column = 0 : i32, row = 2 : i32, value = 7995392 : ui32} // [22:16] start event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213204 : ui32, column = 0 : i32, row = 2 : i32, value = 0 : ui32} // packet_type: 0(core), packet_id: 0 + aiex.npu.write32 {address = 213216 : ui32, column = 0 : i32, row = 2 : i32, value = 1260527873 : ui32} // events: 0x4B(port0 run) 22(event1) 21(event0) 01(true) + aiex.npu.write32 {address = 213220 : ui32, column = 0 : i32, row = 2 : i32, value = 6735 : ui32} // events:0x00 00 1A(lock stall) 4F(port1 run) + aiex.npu.write32 {address = 261888 : ui32, column = 0 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 + aiex.npu.write32 {address = 261892 : ui32, column = 0 : i32, row = 2 : i32, value = 0 : ui32} + aiex.npu.writebd_shimtile {bd_id = 12 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : 
i32, value = 12 : ui32} + + aiex.npu.write32 {address = 606208 : ui32, column = 1 : i32, row = 1 : i32, value = 40192 : ui32} // [15:8] reset event: 157(BROADCAST_15) + aiex.npu.write32 {address = 606416 : ui32, column = 1 : i32, row = 1 : i32, value = 10289152 : ui32} // [23:16] start event: 157(BROADCAST_15) + aiex.npu.write32 {address = 606420 : ui32, column = 1 : i32, row = 1 : i32, value = 12293 : ui32} // [14:12] packet_type: 3(mem_tile), [4:0] packet_id: 5 + aiex.npu.write32 {address = 606432 : ui32, column = 1 : i32, row = 1 : i32, value = 336 : ui32} // events: 0x00 00 01(true) 50(port0 run) + aiex.npu.write32 {address = 606436 : ui32, column = 1 : i32, row = 1 : i32, value = 1415076960 : ui32} // events: 0x54(port1 run) 58(port2 run) 5C(port3 run) 60(port4 run) + aiex.npu.write32 {address = 724736 : ui32, column = 1 : i32, row = 1 : i32, value = 33620000 : ui32} // [29:24] port3 MM2S-2, [21:16] port2 MM2S-1, [13:8] port1 MM2S-0, [5:0] port0 S2MM-0 + aiex.npu.write32 {address = 724740: ui32, column = 1 : i32, row = 1 : i32, value = 3 : ui32} // [5:0] port4 MM2S-3 + aiex.npu.writebd_shimtile {bd_id = 11 : i32, buffer_length = 8192 : i32, buffer_offset = 9216 : i32, column = 1 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 5: i32, packet_type = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.write32 {address = 119308 : ui32, column = 1 : i32, row = 0 : i32, value = 11 : ui32} + + aiex.npu.write32 {address = 606208 : ui32, column = 0 : i32, row = 1 : i32, value = 40192 : ui32} // [15:8] reset event: 157(BROADCAST_15) + aiex.npu.write32 {address = 606416 : ui32, column = 0 : i32, row = 1 : i32, value = 10289152 : ui32} // [23:16] start event: 157(BROADCAST_15) + aiex.npu.write32 {address = 606420 : ui32, column = 0 : i32, row = 1 : i32, value = 12292 : ui32} // [14:12] packet_type: 3(mem_tile), [4:0] packet_id: 4 + aiex.npu.write32 {address = 606432 : ui32, column = 0 : i32, row = 1 : i32, value = 760239192 : ui32} // events: 0x2D(lock release) 50(port0 run) 0x54(port1 run) 58(port2 run) + aiex.npu.write32 {address = 606436 : ui32, column = 0 : i32, row = 1 : i32, value = 1549821032 : ui32} // events: 5C(port3 run) 60(port4 run) 64(port5 run) 68(port6 run) + aiex.npu.write32 {address = 724736 : ui32, column = 0 : i32, row = 1 : i32, value = 33620000 : ui32} // [29:24] port3 MM2S-2, [21:16] port2 MM2S-1, [13:8] port1 MM2S-0, [5:0] port0 S2MM-0 + aiex.npu.write32 {address = 724740: ui32, column = 0 : i32, row = 1 : i32, value = 270595 : ui32} // [21:16] port6 MM2S-4, [13:8] port5 S2MM-1, [5:0] port4 MM2S-3 + aiex.npu.writebd_shimtile {bd_id = 10 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 4: i32, packet_type = 3 : i32, use_next_bd = 0 
: i32, valid_bd = 1 : i32} + aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 10 : ui32} + + aiex.npu.write32 {address = 212992: ui32, column = 0 : i32, row = 0 : i32, value = 32512 : ui32} // [14:8] reset event: 127(USER_EVENT_1) + aiex.npu.write32 {address = 213068: ui32, column = 0 : i32, row = 0 : i32, value = 127 : ui32} // [6:0] broadcast 15: 127(USER_EVENT_1) + aiex.npu.write32 {address = 213000: ui32, column = 0 : i32, row = 0 : i32, value = 127 : ui32} // event generate [6:0]: 127(USER_EVENT_1) + + // + memref.assume_alignment %arg0, 64 : memref<16x16xi32> + memref.assume_alignment %arg1, 64 : memref<16x16xi32> + memref.assume_alignment %arg2, 64 : memref<16x16xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<16x16xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<16x16xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 2 : i64, metadata = @airMemcpyId12} : memref<16x16xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} + return + } + } {sym_name = "segment_0"} +} diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/aie_cascadex4.mlir b/test/npu-xrt/matrix_multiplication_using_cascade/aie_cascadex4.mlir new file mode 100644 index 0000000000..fb58fa0fb0 --- /dev/null +++ b/test/npu-xrt/matrix_multiplication_using_cascade/aie_cascadex4.mlir @@ -0,0 +1,496 @@ +//===- aie.mlir ------------------------------------------------*- MLIR -*-===// +// +// Copyright (C) 2024, Advanced Micro Devices, Inc. +// SPDX-License-Identifier: MIT +// +//===----------------------------------------------------------------------===// + +module { + aie.device(npu1_4col) { + func.func private @matmul_scalar_put_4x1x4_4x4x4_i32_i32(memref<1x4x4x4xi32, 2 : i32>, memref<4x1x4x4xi32, 2 : i32>, memref<4x4x4x4xi32, 2 : i32>) + func.func private @matmul_scalar_put_get_4x1x4_4x4x4_i32_i32(memref<1x4x4x4xi32, 2 : i32>, memref<4x1x4x4xi32, 2 : i32>, memref<4x4x4x4xi32, 2 : i32>) + func.func private @matmul_scalar_get_4x1x4_4x4x4_i32_i32(memref<1x4x4x4xi32, 2 : i32>, memref<4x1x4x4xi32, 2 : i32>, memref<4x4x4x4xi32, 2 : i32>) + // + func.func private @event_0() + func.func private @event_1() + func.func private @flush_trace() + // + %tile_0_0 = aie.tile(0, 0) + %tile_0_1 = aie.tile(0, 1) + %tile_1_1 = aie.tile(1, 1) + %tile_0_2 = aie.tile(0, 2) + %tile_1_2 = aie.tile(1, 2) + %tile_2_2 = aie.tile(2, 2) + %tile_3_2 = aie.tile(3, 2) + // + %tile_1_0 = aie.tile(1, 0) + %tile_2_0 = aie.tile(2, 0) + %tile_3_0 = aie.tile(3, 0) + // + %lock_1_1 = aie.lock(%tile_1_1, 1) {init = 4 : i32} + %lock_1_1_0 = aie.lock(%tile_1_1, 0) {init = 0 : i32} + %lock_0_1 = aie.lock(%tile_0_1, 3) {init = 4 : i32} + %lock_0_1_1 = aie.lock(%tile_0_1, 2) {init = 0 : i32} + %lock_0_1_2 = aie.lock(%tile_0_1, 1) {init = 1 : i32} + %lock_0_1_3 = aie.lock(%tile_0_1, 0) {init = 0 : i32} + %lock_0_2 = aie.lock(%tile_0_2, 3) {init = 1 : i32} + %lock_0_2_4 = aie.lock(%tile_0_2, 2) {init = 0 : i32} + %lock_0_2_5 = aie.lock(%tile_0_2, 1) {init = 1 : i32} + %lock_0_2_6 = aie.lock(%tile_0_2, 0) {init = 0 : i32} + %lock_1_2 = aie.lock(%tile_1_2, 3) {init = 1 : i32} + %lock_1_2_7 = aie.lock(%tile_1_2, 2) {init = 0 : i32} + %lock_1_2_8 = aie.lock(%tile_1_2, 1) {init = 1 : i32} + %lock_1_2_9 = aie.lock(%tile_1_2, 0) {init = 0 : i32} + %lock_2_2 = 
aie.lock(%tile_2_2, 3) {init = 1 : i32} + %lock_2_2_10 = aie.lock(%tile_2_2, 2) {init = 0 : i32} + %lock_2_2_11 = aie.lock(%tile_2_2, 1) {init = 1 : i32} + %lock_2_2_12 = aie.lock(%tile_2_2, 0) {init = 0 : i32} + %lock_3_2 = aie.lock(%tile_3_2, 5) {init = 1 : i32} + %lock_3_2_13 = aie.lock(%tile_3_2, 4) {init = 0 : i32} + %lock_3_2_14 = aie.lock(%tile_3_2, 3) {init = 1 : i32} + %lock_3_2_15 = aie.lock(%tile_3_2, 2) {init = 0 : i32} + %lock_3_2_16 = aie.lock(%tile_3_2, 1) {init = 1 : i32} + %lock_3_2_17 = aie.lock(%tile_3_2, 0) {init = 0 : i32} + %buf14 = aie.buffer(%tile_0_1) {mem_bank = 0 : i32, sym_name = "buf14"} : memref<16x16xi32, 1 : i32> + %buf13 = aie.buffer(%tile_1_1) {mem_bank = 0 : i32, sym_name = "buf13"} : memref<16x16xi32, 1 : i32> + %buf12 = aie.buffer(%tile_0_1) {mem_bank = 0 : i32, sym_name = "buf12"} : memref<16x16xi32, 1 : i32> + %buf11 = aie.buffer(%tile_3_2) {mem_bank = 0 : i32, sym_name = "buf11"} : memref<1x4x4x4xi32, 2 : i32> + %buf10 = aie.buffer(%tile_3_2) {mem_bank = 0 : i32, sym_name = "buf10"} : memref<4x1x4x4xi32, 2 : i32> + %buf9 = aie.buffer(%tile_3_2) {mem_bank = 0 : i32, sym_name = "buf9"} : memref<4x4x4x4xi32, 2 : i32> + %buf8 = aie.buffer(%tile_2_2) {mem_bank = 0 : i32, sym_name = "buf8"} : memref<1x4x4x4xi32, 2 : i32> + %buf7 = aie.buffer(%tile_2_2) {mem_bank = 0 : i32, sym_name = "buf7"} : memref<4x1x4x4xi32, 2 : i32> + %buf6 = aie.buffer(%tile_2_2) {mem_bank = 0 : i32, sym_name = "buf6"} : memref<4x4x4x4xi32, 2 : i32> + %buf5 = aie.buffer(%tile_1_2) {mem_bank = 0 : i32, sym_name = "buf5"} : memref<1x4x4x4xi32, 2 : i32> + %buf4 = aie.buffer(%tile_1_2) {mem_bank = 0 : i32, sym_name = "buf4"} : memref<4x1x4x4xi32, 2 : i32> + %buf3 = aie.buffer(%tile_1_2) {mem_bank = 0 : i32, sym_name = "buf3"} : memref<4x4x4x4xi32, 2 : i32> + %buf2 = aie.buffer(%tile_0_2) {mem_bank = 0 : i32, sym_name = "buf2"} : memref<1x4x4x4xi32, 2 : i32> + %buf1 = aie.buffer(%tile_0_2) {mem_bank = 0 : i32, sym_name = "buf1"} : memref<4x1x4x4xi32, 2 : i32> + %buf0 = aie.buffer(%tile_0_2) {mem_bank = 0 : i32, sym_name = "buf0"} : memref<4x4x4x4xi32, 2 : i32> + %mem_0_2 = aie.mem(%tile_0_2) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_2_5, AcquireGreaterEqual, 1) + aie.dma_bd(%buf2 : memref<1x4x4x4xi32, 2 : i32>, 0, 64) + aie.use_lock(%lock_0_2_6, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(S2MM, 1, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_2, AcquireGreaterEqual, 1) + aie.dma_bd(%buf1 : memref<4x1x4x4xi32, 2 : i32>, 0, 64) + aie.use_lock(%lock_0_2_4, Release, 1) + aie.next_bd ^bb4 + } + %core_0_2 = aie.core(%tile_0_2) { + %c0_i32 = arith.constant 0 : i32 + %c4 = arith.constant 4 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_2_6, AcquireGreaterEqual, 1) + aie.use_lock(%lock_0_2_4, AcquireGreaterEqual, 1) + // + func.call @event_0() : () -> () + // + scf.for %arg0 = %c0 to %c4 step %c1 { + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c4 step %c1 { + memref.store %c0_i32, %buf0[%arg0, %arg1, %arg2, %arg3] : memref<4x4x4x4xi32, 2 : i32> + } + } + } + } + // + func.call @event_1() : () -> () + // + func.call @matmul_scalar_put_4x1x4_4x4x4_i32_i32(%buf2, %buf1, %buf0) : (memref<1x4x4x4xi32, 2 : i32>, 
memref<4x1x4x4xi32, 2 : i32>, memref<4x4x4x4xi32, 2 : i32>) -> () + aie.use_lock(%lock_0_2_5, Release, 1) + aie.use_lock(%lock_0_2, Release, 1) + // + func.call @flush_trace() : () -> () + // + cf.br ^bb1 + } {elf_file = "segment_0_core_0_2.elf",link_with = "mm.o"} + %mem_1_2 = aie.mem(%tile_1_2) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_1_2_8, AcquireGreaterEqual, 1) + aie.dma_bd(%buf5 : memref<1x4x4x4xi32, 2 : i32>, 0, 64) + aie.use_lock(%lock_1_2_9, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(S2MM, 1, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_1_2, AcquireGreaterEqual, 1) + aie.dma_bd(%buf4 : memref<4x1x4x4xi32, 2 : i32>, 0, 64) + aie.use_lock(%lock_1_2_7, Release, 1) + aie.next_bd ^bb4 + } + %core_1_2 = aie.core(%tile_1_2) { + %c0_i32 = arith.constant 0 : i32 + %c4 = arith.constant 4 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_1_2_9, AcquireGreaterEqual, 1) + aie.use_lock(%lock_1_2_7, AcquireGreaterEqual, 1) + // + func.call @event_0() : () -> () + // + scf.for %arg0 = %c0 to %c4 step %c1 { + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c4 step %c1 { + memref.store %c0_i32, %buf3[%arg0, %arg1, %arg2, %arg3] : memref<4x4x4x4xi32, 2 : i32> + } + } + } + } + // + func.call @event_1() : () -> () + // + func.call @matmul_scalar_put_get_4x1x4_4x4x4_i32_i32(%buf5, %buf4, %buf3) : (memref<1x4x4x4xi32, 2 : i32>, memref<4x1x4x4xi32, 2 : i32>, memref<4x4x4x4xi32, 2 : i32>) -> () + aie.use_lock(%lock_1_2_8, Release, 1) + aie.use_lock(%lock_1_2, Release, 1) + // + func.call @flush_trace() : () -> () + // + cf.br ^bb1 + } {elf_file = "segment_0_core_1_2.elf",link_with = "mm.o"} + %mem_2_2 = aie.mem(%tile_2_2) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_2_2_11, AcquireGreaterEqual, 1) + aie.dma_bd(%buf8 : memref<1x4x4x4xi32, 2 : i32>, 0, 64) + aie.use_lock(%lock_2_2_12, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(S2MM, 1, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_2_2, AcquireGreaterEqual, 1) + aie.dma_bd(%buf7 : memref<4x1x4x4xi32, 2 : i32>, 0, 64) + aie.use_lock(%lock_2_2_10, Release, 1) + aie.next_bd ^bb4 + } + %core_2_2 = aie.core(%tile_2_2) { + %c0_i32 = arith.constant 0 : i32 + %c4 = arith.constant 4 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_2_2_12, AcquireGreaterEqual, 1) + aie.use_lock(%lock_2_2_10, AcquireGreaterEqual, 1) + // + func.call @event_0() : () -> () + // + scf.for %arg0 = %c0 to %c4 step %c1 { + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c4 step %c1 { + memref.store %c0_i32, %buf6[%arg0, %arg1, %arg2, %arg3] : memref<4x4x4x4xi32, 2 : i32> + } + } + } + } + // + func.call @event_1() : () -> () + // + func.call @matmul_scalar_put_get_4x1x4_4x4x4_i32_i32(%buf8, %buf7, %buf6) : (memref<1x4x4x4xi32, 2 : i32>, memref<4x1x4x4xi32, 2 : i32>, memref<4x4x4x4xi32, 2 : i32>) -> () + aie.use_lock(%lock_2_2_11, Release, 
1) + aie.use_lock(%lock_2_2, Release, 1) + // + func.call @flush_trace() : () -> () + // + cf.br ^bb1 + } {elf_file = "segment_0_core_2_2.elf",link_with = "mm.o"} + %mem_3_2 = aie.mem(%tile_3_2) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb5, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_3_2_14, AcquireGreaterEqual, 1) + aie.dma_bd(%buf11 : memref<1x4x4x4xi32, 2 : i32>, 0, 64) + aie.use_lock(%lock_3_2_15, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(S2MM, 1, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_3_2, AcquireGreaterEqual, 1) + aie.dma_bd(%buf10 : memref<4x1x4x4xi32, 2 : i32>, 0, 64) + aie.use_lock(%lock_3_2_13, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb0 + %2 = aie.dma_start(MM2S, 0, ^bb6, ^bb3, repeat_count = 1) + ^bb6: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_3_2_17, AcquireGreaterEqual, 1) + aie.dma_bd(%buf9 : memref<4x4x4x4xi32, 2 : i32>, 0, 256, [, , ]) + aie.use_lock(%lock_3_2_16, Release, 1) + aie.next_bd ^bb6 + } + %core_3_2 = aie.core(%tile_3_2) { + %c0_i32 = arith.constant 0 : i32 + %c4 = arith.constant 4 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_3_2_16, AcquireGreaterEqual, 1) + aie.use_lock(%lock_3_2_15, AcquireGreaterEqual, 1) + aie.use_lock(%lock_3_2_13, AcquireGreaterEqual, 1) + // + func.call @event_0() : () -> () + // + scf.for %arg0 = %c0 to %c4 step %c1 { + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c4 step %c1 { + memref.store %c0_i32, %buf9[%arg0, %arg1, %arg2, %arg3] : memref<4x4x4x4xi32, 2 : i32> + } + } + } + } + // + func.call @event_1() : () -> () + // + func.call @matmul_scalar_get_4x1x4_4x4x4_i32_i32(%buf11, %buf10, %buf9) : (memref<1x4x4x4xi32, 2 : i32>, memref<4x1x4x4xi32, 2 : i32>, memref<4x4x4x4xi32, 2 : i32>) -> () + aie.use_lock(%lock_3_2_17, Release, 1) + aie.use_lock(%lock_3_2_14, Release, 1) + aie.use_lock(%lock_3_2, Release, 1) + // + func.call @flush_trace() : () -> () + // + cf.br ^bb1 + } {elf_file = "segment_0_core_3_2.elf", link_with = "mm.o"} + aie.flow(%tile_0_0, DMA : 0, %tile_0_1, DMA : 0) + aie.flow(%tile_0_0, DMA : 1, %tile_1_1, DMA : 0) + aie.flow(%tile_0_1, DMA : 0, %tile_0_2, DMA : 0) + aie.flow(%tile_0_1, DMA : 1, %tile_1_2, DMA : 0) + aie.flow(%tile_0_1, DMA : 2, %tile_2_2, DMA : 0) + aie.flow(%tile_0_1, DMA : 3, %tile_3_2, DMA : 0) + aie.flow(%tile_1_1, DMA : 0, %tile_0_2, DMA : 1) + aie.flow(%tile_1_1, DMA : 1, %tile_1_2, DMA : 1) + aie.flow(%tile_1_1, DMA : 2, %tile_2_2, DMA : 1) + aie.flow(%tile_1_1, DMA : 3, %tile_3_2, DMA : 1) + aie.flow(%tile_3_2, DMA : 0, %tile_0_1, DMA : 1) + aie.flow(%tile_0_1, DMA : 4, %tile_0_0, DMA : 0) + aie.cascade_flow(%tile_0_2, %tile_1_2) + aie.cascade_flow(%tile_1_2, %tile_2_2) + aie.cascade_flow(%tile_2_2, %tile_3_2) + // + aie.packet_flow(0) { + aie.packet_source<%tile_0_2, Trace : 0> + aie.packet_dest<%tile_0_0, DMA : 1> + } {keep_pkt_header = true} + aie.packet_flow(1) { + aie.packet_source<%tile_1_2, Trace : 0> + aie.packet_dest<%tile_1_0, DMA : 1> + } {keep_pkt_header = true} + aie.packet_flow(2) { + aie.packet_source<%tile_2_2, Trace : 0> + aie.packet_dest<%tile_2_0, DMA : 1> + } {keep_pkt_header = true} + aie.packet_flow(3) { + aie.packet_source<%tile_3_2, Trace : 0> + aie.packet_dest<%tile_3_0, DMA : 1> + } {keep_pkt_header = true} + 
aie.packet_flow(4) { + aie.packet_source<%tile_0_1, Trace : 0> + aie.packet_dest<%tile_0_0, DMA : 1> + } {keep_pkt_header = true} + aie.packet_flow(5) { + aie.packet_source<%tile_1_1, Trace : 0> + aie.packet_dest<%tile_1_0, DMA : 1> + } {keep_pkt_header = true} + // + %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb13, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_1, AcquireGreaterEqual, 4) + aie.dma_bd(%buf14 : memref<16x16xi32, 1 : i32>, 0, 256) + aie.use_lock(%lock_0_1_1, Release, 4) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb5 + %1 = aie.dma_start(S2MM, 1, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_1_2, AcquireGreaterEqual, 1) + aie.dma_bd(%buf12 : memref<16x16xi32, 1 : i32>, 0, 256) + aie.use_lock(%lock_0_1_3, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb5 + %2 = aie.dma_start(MM2S, 0, ^bb6, ^bb3, repeat_count = 1) + ^bb6: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf14 : memref<16x16xi32, 1 : i32>, 0, 64, [, , ]) + aie.use_lock(%lock_0_1, Release, 1) + aie.next_bd ^bb6 + ^bb7: // pred: ^bb7 + %3 = aie.dma_start(MM2S, 1, ^bb8, ^bb5, repeat_count = 1) + ^bb8: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf14 : memref<16x16xi32, 1 : i32>, 4, 64, [, , ]) + aie.use_lock(%lock_0_1, Release, 1) + aie.next_bd ^bb8 + ^bb9: // pred: ^bb9 + %4 = aie.dma_start(MM2S, 2, ^bb10, ^bb7, repeat_count = 1) + ^bb10: // 2 preds: ^bb7, ^bb8 + aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf14 : memref<16x16xi32, 1 : i32>, 8, 64, [, , ]) + aie.use_lock(%lock_0_1, Release, 1) + aie.next_bd ^bb10 + ^bb11: // pred: ^bb0 + %5 = aie.dma_start(MM2S, 3, ^bb12, ^bb9, repeat_count = 1) + ^bb12: // 2 preds: ^bb9, ^bb10 + aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf14 : memref<16x16xi32, 1 : i32>, 12, 64, [, , ]) + aie.use_lock(%lock_0_1, Release, 1) + aie.next_bd ^bb12 + ^bb13: // pred: ^bb0 + %6 = aie.dma_start(MM2S, 4, ^bb14, ^bb11, repeat_count = 1) + ^bb14: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_1_3, AcquireGreaterEqual, 1) + aie.dma_bd(%buf12 : memref<16x16xi32, 1 : i32>, 0, 256) + aie.use_lock(%lock_0_1_2, Release, 1) + aie.next_bd ^bb14 + } + %memtile_dma_1_1 = aie.memtile_dma(%tile_1_1) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb9, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_1_1, AcquireGreaterEqual, 4) + aie.dma_bd(%buf13 : memref<16x16xi32, 1 : i32>, 0, 256) + aie.use_lock(%lock_1_1_0, Release, 4) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb5 + %1 = aie.dma_start(MM2S, 0, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1) + aie.dma_bd(%buf13 : memref<16x16xi32, 1 : i32>, 0, 64, [, , ]) + aie.use_lock(%lock_1_1, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb7 + %2 = aie.dma_start(MM2S, 1, ^bb6, ^bb3, repeat_count = 1) + ^bb6: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1) + aie.dma_bd(%buf13 : memref<16x16xi32, 1 : i32>, 64, 64, [, , ]) + aie.use_lock(%lock_1_1, Release, 1) + aie.next_bd ^bb6 + ^bb7: // pred: ^bb9 + %3 = aie.dma_start(MM2S, 2, ^bb8, ^bb5, repeat_count = 1) + ^bb8: // 2 preds: ^bb7, ^bb8 + aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1) + aie.dma_bd(%buf13 : memref<16x16xi32, 1 : i32>, 128, 64, [, , ]) + aie.use_lock(%lock_1_1, Release, 1) + aie.next_bd ^bb8 + ^bb9: // 
pred: ^bb0 + %4 = aie.dma_start(MM2S, 3, ^bb10, ^bb7, repeat_count = 1) + ^bb10: // 2 preds: ^bb9, ^bb10 + aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1) + aie.dma_bd(%buf13 : memref<16x16xi32, 1 : i32>, 192, 64, [, , ]) + aie.use_lock(%lock_1_1, Release, 1) + aie.next_bd ^bb10 + } + aie.shim_dma_allocation @airMemcpyId12(S2MM, 0, 0) + memref.global "public" @airMemcpyId12 : memref<16x16xi32, 1 : i32> + aie.shim_dma_allocation @airMemcpyId4(MM2S, 0, 0) + memref.global "public" @airMemcpyId4 : memref<16x16xi32, 1 : i32> + aie.shim_dma_allocation @airMemcpyId5(MM2S, 1, 0) + memref.global "public" @airMemcpyId5 : memref<16x16xi32, 1 : i32> + func.func @matmul_16x16_16xi32__dispatch_0_matmul_16x16x16_i32(%arg0: memref<16x16xi32>, %arg1: memref<16x16xi32>, %arg2: memref<16x16xi32>) { + // + aiex.npu.write32 {address = 212992 : ui32, column = 3 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213200 : ui32, column = 3 : i32, row = 2 : i32, value = 7995392 : ui32} // [22:16] start event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213204 : ui32, column = 3 : i32, row = 2 : i32, value = 3 : ui32} // packet_type: 0(core), packet_id: 3 + aiex.npu.write32 {address = 213216 : ui32, column = 3 : i32, row = 2 : i32, value = 1260527873 : ui32} // events: 0x4B(port0 run) 22(event1) 21(event0) 01(true) + aiex.npu.write32 {address = 213220 : ui32, column = 3 : i32, row = 2 : i32, value = 757865039 : ui32} // events: 0x2D(lock release) 2C(lock acquire) 1A(lock stall) 4F(port1 run) + aiex.npu.write32 {address = 261888 : ui32, column = 3 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 + aiex.npu.write32 {address = 261892 : ui32, column = 3 : i32, row = 2 : i32, value = 0 : ui32} + aiex.npu.writebd_shimtile {bd_id = 15 : i32, buffer_length = 8192 : i32, buffer_offset = 25600 : i32, column = 3 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 3: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.write32 {address = 119308 : ui32, column = 3 : i32, row = 0 : i32, value = 15 : ui32} + + aiex.npu.write32 {address = 212992 : ui32, column = 2 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213200 : ui32, column = 2 : i32, row = 2 : i32, value = 7995392 : ui32} // [22:16] start event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213204 : ui32, column = 2 : i32, row = 2 : i32, value = 2 : ui32} // packet_type: 0(core), packet_id: 2 + aiex.npu.write32 {address = 213216 : ui32, column = 2 : i32, row = 2 : i32, value = 1260527873 : ui32} // events: 0x4B(port0 run) 22(event1) 21(event0) 01(true) + aiex.npu.write32 {address = 213220 : ui32, column = 2 : i32, row = 2 : i32, value = 757865039 : ui32} // events: 0x2D(lock release) 2C(lock acquire) 1A(lock stall) 4F(port1 run) + aiex.npu.write32 {address = 261888 : ui32, column = 2 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 + aiex.npu.write32 {address = 261892 : ui32, column = 2 : i32, row = 2 : i32, value = 0 : ui32} + aiex.npu.writebd_shimtile 
{bd_id = 14 : i32, buffer_length = 8192 : i32, buffer_offset = 17408 : i32, column = 2 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 2: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.write32 {address = 119308 : ui32, column = 2 : i32, row = 0 : i32, value = 14 : ui32} + + aiex.npu.write32 {address = 212992 : ui32, column = 1 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213200 : ui32, column = 1 : i32, row = 2 : i32, value = 7995392 : ui32} // [22:16] start event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213204 : ui32, column = 1 : i32, row = 2 : i32, value = 1 : ui32} // packet_type: 0(core), packet_id: 1 + aiex.npu.write32 {address = 213216 : ui32, column = 1 : i32, row = 2 : i32, value = 1260527873 : ui32} // events: 0x4B(port0 run) 22(event1) 21(event0) 01(true) + aiex.npu.write32 {address = 213220 : ui32, column = 1 : i32, row = 2 : i32, value = 757865039 : ui32} // events: 0x2D(lock release) 2C(lock acquire) 1A(lock stall) 4F(port1 run) + aiex.npu.write32 {address = 261888 : ui32, column = 1 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 + aiex.npu.write32 {address = 261892 : ui32, column = 1 : i32, row = 2 : i32, value = 0 : ui32} + aiex.npu.writebd_shimtile {bd_id = 13 : i32, buffer_length = 8192 : i32, buffer_offset = 9216 : i32, column = 1 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 1: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.write32 {address = 119308 : ui32, column = 1 : i32, row = 0 : i32, value = 13 : ui32} + + aiex.npu.write32 {address = 212992 : ui32, column = 0 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213200 : ui32, column = 0 : i32, row = 2 : i32, value = 7995392 : ui32} // [22:16] start event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213204 : ui32, column = 0 : i32, row = 2 : i32, value = 0 : ui32} // packet_type: 0(core), packet_id: 0 + aiex.npu.write32 {address = 213216 : ui32, column = 0 : i32, row = 2 : i32, value = 1260527873 : ui32} // events: 0x4B(port0 run) 22(event1) 21(event0) 01(true) + aiex.npu.write32 {address = 213220 : ui32, column = 0 : i32, row = 2 : i32, value = 757865039 : ui32} // events: 0x2D(lock release) 2C(lock acquire) 1A(lock stall) 4F(port1 run) + aiex.npu.write32 {address = 261888 : ui32, column = 0 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 + aiex.npu.write32 {address = 261892 : ui32, column = 0 : i32, row = 2 : i32, value = 0 : ui32} + aiex.npu.writebd_shimtile {bd_id = 12 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, column_num 
= 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 12 : ui32} + + aiex.npu.write32 {address = 606208 : ui32, column = 1 : i32, row = 1 : i32, value = 40192 : ui32} // [15:8] reset event: 157(BROADCAST_15) + aiex.npu.write32 {address = 606416 : ui32, column = 1 : i32, row = 1 : i32, value = 10289152 : ui32} // [23:16] start event: 157(BROADCAST_15) + aiex.npu.write32 {address = 606420 : ui32, column = 1 : i32, row = 1 : i32, value = 12293 : ui32} // [14:12] packet_type: 3(mem_tile), [4:0] packet_id: 5 + aiex.npu.write32 {address = 606432 : ui32, column = 1 : i32, row = 1 : i32, value = 336 : ui32} // events: 0x00 00 01(true) 50(port0 run) + aiex.npu.write32 {address = 606436 : ui32, column = 1 : i32, row = 1 : i32, value = 1415076960 : ui32} // events: 0x54(port1 run) 58(port2 run) 5C(port3 run) 60(port4 run) + aiex.npu.write32 {address = 724736 : ui32, column = 1 : i32, row = 1 : i32, value = 33620000 : ui32} // [29:24] port3 MM2S-2, [21:16] port2 MM2S-1, [13:8] port1 MM2S-0, [5:0] port0 S2MM-0 + aiex.npu.write32 {address = 724740: ui32, column = 1 : i32, row = 1 : i32, value = 3 : ui32} // [5:0] port4 MM2S-3 + aiex.npu.writebd_shimtile {bd_id = 11 : i32, buffer_length = 8192 : i32, buffer_offset = 9216 : i32, column = 1 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 5: i32, packet_type = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.write32 {address = 119308 : ui32, column = 1 : i32, row = 0 : i32, value = 11 : ui32} + + aiex.npu.write32 {address = 606208 : ui32, column = 0 : i32, row = 1 : i32, value = 40192 : ui32} // [15:8] reset event: 157(BROADCAST_15) + aiex.npu.write32 {address = 606416 : ui32, column = 0 : i32, row = 1 : i32, value = 10289152 : ui32} // [23:16] start event: 157(BROADCAST_15) + aiex.npu.write32 {address = 606420 : ui32, column = 0 : i32, row = 1 : i32, value = 12292 : ui32} // [14:12] packet_type: 3(mem_tile), [4:0] packet_id: 4 + aiex.npu.write32 {address = 606432 : ui32, column = 0 : i32, row = 1 : i32, value = 760239192 : ui32} // events: 0x2D(lock release) 50(port0 run) 0x54(port1 run) 58(port2 run) + aiex.npu.write32 {address = 606436 : ui32, column = 0 : i32, row = 1 : i32, value = 1549821032 : ui32} // events: 5C(port3 run) 60(port4 run) 64(port5 run) 68(port6 run) + aiex.npu.write32 {address = 724736 : ui32, column = 0 : i32, row = 1 : i32, value = 33620000 : ui32} // [29:24] port3 MM2S-2, [21:16] port2 MM2S-1, [13:8] port1 MM2S-0, [5:0] port0 S2MM-0 + aiex.npu.write32 {address = 724740: ui32, column = 0 : i32, row = 1 : i32, value = 270595 : ui32} // [21:16] port6 MM2S-4, [13:8] port5 S2MM-1, [5:0] port4 MM2S-3 + aiex.npu.writebd_shimtile 
{bd_id = 10 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 4: i32, packet_type = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 10 : ui32} + + aiex.npu.write32 {address = 212992: ui32, column = 0 : i32, row = 0 : i32, value = 32512 : ui32} // [14:8] reset event: 127(USER_EVENT_1) + aiex.npu.write32 {address = 213068: ui32, column = 0 : i32, row = 0 : i32, value = 127 : ui32} // [6:0] broadcast 15: 127(USER_EVENT_1) + aiex.npu.write32 {address = 213000: ui32, column = 0 : i32, row = 0 : i32, value = 127 : ui32} // event generate [6:0]: 127(USER_EVENT_1) + + // + memref.assume_alignment %arg0, 64 : memref<16x16xi32> + memref.assume_alignment %arg1, 64 : memref<16x16xi32> + memref.assume_alignment %arg2, 64 : memref<16x16xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<16x16xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<16x16xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 2 : i64, metadata = @airMemcpyId12} : memref<16x16xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} + return + } + } {sym_name = "segment_0"} +} diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx1.mlir b/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx1.mlir new file mode 100644 index 0000000000..e6d4d7df97 --- /dev/null +++ b/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx1.mlir @@ -0,0 +1,201 @@ +module { + aie.device(npu1_4col) { + func.func private @matmul_scalar_4x2x4_4x8x4_i32_i32(memref<2x4x4x8xi32, 2 : i32>, memref<4x2x8x4xi32, 2 : i32>, memref<4x4x4x4xi32, 2 : i32>) + // + func.func private @event_0() + func.func private @event_1() + func.func private @flush_trace() + // + %tile_0_0 = aie.tile(0, 0) + %tile_0_1 = aie.tile(0, 1) + %tile_1_1 = aie.tile(1, 1) + %tile_2_1 = aie.tile(2, 1) + %tile_0_2 = aie.tile(0, 2) + %lock_1_1 = aie.lock(%tile_1_1, 1) {init = 1 : i32} + %lock_1_1_0 = aie.lock(%tile_1_1, 0) {init = 0 : i32} + %lock_0_1 = aie.lock(%tile_0_1, 1) {init = 1 : i32} + %lock_0_1_1 = aie.lock(%tile_0_1, 0) {init = 0 : i32} + %lock_2_1 = aie.lock(%tile_2_1, 1) {init = 1 : i32} + %lock_2_1_2 = aie.lock(%tile_2_1, 0) {init = 0 : i32} + %lock_0_2 = aie.lock(%tile_0_2, 5) {init = 1 : i32} + %lock_0_2_3 = aie.lock(%tile_0_2, 4) {init = 0 : i32} + %lock_0_2_4 = aie.lock(%tile_0_2, 3) {init = 1 : i32} + %lock_0_2_5 = aie.lock(%tile_0_2, 2) {init = 0 : i32} + %lock_0_2_6 = aie.lock(%tile_0_2, 1) {init = 1 : i32} + %lock_0_2_7 = aie.lock(%tile_0_2, 0) {init = 0 : i32} + %buf5 = aie.buffer(%tile_0_1) {mem_bank = 0 : i32, sym_name = "buf5"} : memref<16x16xi32, 1 : i32> + %buf4 = aie.buffer(%tile_1_1) {mem_bank = 0 : i32, sym_name = "buf4"} : memref<16x16xi32, 1 : i32> + %buf3 = aie.buffer(%tile_2_1) {mem_bank = 0 : i32, sym_name = "buf3"} : 
memref<16x16xi32, 1 : i32> + %buf2 = aie.buffer(%tile_0_2) {mem_bank = 0 : i32, sym_name = "buf2"} : memref<2x4x4x8xi32, 2 : i32> + %buf1 = aie.buffer(%tile_0_2) {mem_bank = 0 : i32, sym_name = "buf1"} : memref<4x2x8x4xi32, 2 : i32> + %buf0 = aie.buffer(%tile_0_2) {mem_bank = 0 : i32, sym_name = "buf0"} : memref<4x4x4x4xi32, 2 : i32> + %mem_0_2 = aie.mem(%tile_0_2) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb5, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_2_4, AcquireGreaterEqual, 1) + aie.dma_bd(%buf2 : memref<2x4x4x8xi32, 2 : i32>, 0, 256) + aie.use_lock(%lock_0_2_5, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb5 + %1 = aie.dma_start(S2MM, 1, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_2, AcquireGreaterEqual, 1) + aie.dma_bd(%buf1 : memref<4x2x8x4xi32, 2 : i32>, 0, 256) + aie.use_lock(%lock_0_2_3, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb0 + %2 = aie.dma_start(MM2S, 0, ^bb6, ^bb3, repeat_count = 1) + ^bb6: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_0_2_7, AcquireGreaterEqual, 1) + aie.dma_bd(%buf0 : memref<4x4x4x4xi32, 2 : i32>, 0, 256, [, , ]) + aie.use_lock(%lock_0_2_6, Release, 1) + aie.next_bd ^bb6 + } + %core_0_2 = aie.core(%tile_0_2) { + %c0_i32 = arith.constant 0 : i32 + %c4 = arith.constant 4 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_2_6, AcquireGreaterEqual, 1) + aie.use_lock(%lock_0_2_5, AcquireGreaterEqual, 1) + aie.use_lock(%lock_0_2_3, AcquireGreaterEqual, 1) + // + func.call @event_0() : () -> () + // + scf.for %arg0 = %c0 to %c4 step %c1 { + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c4 step %c1 { + memref.store %c0_i32, %buf0[%arg0, %arg1, %arg2, %arg3] : memref<4x4x4x4xi32, 2 : i32> + } + } + } + } + // + func.call @event_1() : () -> () + // + func.call @matmul_scalar_4x2x4_4x8x4_i32_i32(%buf2, %buf1, %buf0) : (memref<2x4x4x8xi32, 2 : i32>, memref<4x2x8x4xi32, 2 : i32>, memref<4x4x4x4xi32, 2 : i32>) -> () + aie.use_lock(%lock_0_2_7, Release, 1) + aie.use_lock(%lock_0_2_4, Release, 1) + aie.use_lock(%lock_0_2, Release, 1) + // + func.call @flush_trace() : () -> () + // + cf.br ^bb1 + } {elf_file = "segment_0_core_0_2.elf", link_with = "mm.o"} + aie.flow(%tile_0_0, DMA : 0, %tile_0_1, DMA : 0) + aie.flow(%tile_0_0, DMA : 1, %tile_1_1, DMA : 0) + aie.flow(%tile_2_1, DMA : 0, %tile_0_0, DMA : 0) + aie.flow(%tile_0_1, DMA : 0, %tile_0_2, DMA : 0) + aie.flow(%tile_1_1, DMA : 0, %tile_0_2, DMA : 1) + aie.flow(%tile_0_2, DMA : 0, %tile_2_1, DMA : 0) + // + aie.packet_flow(0) { + aie.packet_source<%tile_0_2, Trace : 0> + aie.packet_dest<%tile_0_0, DMA : 1> + } {keep_pkt_header = true} + aie.packet_flow(4) { + aie.packet_source<%tile_0_1, Trace : 0> + aie.packet_dest<%tile_0_0, DMA : 1> + } {keep_pkt_header = true} + // + %memtile_dma_2_1 = aie.memtile_dma(%tile_2_1) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_2_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf3 : memref<16x16xi32, 1 : i32>, 0, 256) + aie.use_lock(%lock_2_1_2, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(MM2S, 0, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_2_1_2, AcquireGreaterEqual, 1) + aie.dma_bd(%buf3 : 
memref<16x16xi32, 1 : i32>, 0, 256) + aie.use_lock(%lock_2_1, Release, 1) + aie.next_bd ^bb4 + } + %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf5 : memref<16x16xi32, 1 : i32>, 0, 256) + aie.use_lock(%lock_0_1_1, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(MM2S, 0, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf5 : memref<16x16xi32, 1 : i32>, 0, 256, [, , ]) + aie.use_lock(%lock_0_1, Release, 1) + aie.next_bd ^bb4 + } + %memtile_dma_1_1 = aie.memtile_dma(%tile_1_1) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf4 : memref<16x16xi32, 1 : i32>, 0, 256) + aie.use_lock(%lock_1_1_0, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(MM2S, 0, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1) + aie.dma_bd(%buf4 : memref<16x16xi32, 1 : i32>, 0, 256, [, , ]) + aie.use_lock(%lock_1_1, Release, 1) + aie.next_bd ^bb4 + } + aie.shim_dma_allocation @airMemcpyId12(S2MM, 0, 0) + memref.global "public" @airMemcpyId12 : memref<16x16xi32, 1 : i32> + aie.shim_dma_allocation @airMemcpyId4(MM2S, 0, 0) + memref.global "public" @airMemcpyId4 : memref<16x16xi32, 1 : i32> + aie.shim_dma_allocation @airMemcpyId5(MM2S, 1, 0) + memref.global "public" @airMemcpyId5 : memref<16x16xi32, 1 : i32> + func.func @matmul_16x16_16xi32__dispatch_0_matmul_16x16x16_i32(%arg0: memref<16x16xi32>, %arg1: memref<16x16xi32>, %arg2: memref<16x16xi32>) { + // + aiex.npu.write32 {address = 212992 : ui32, column = 0 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213200 : ui32, column = 0 : i32, row = 2 : i32, value = 7995392 : ui32} // [22:16] start event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213204 : ui32, column = 0 : i32, row = 2 : i32, value = 0 : ui32} // packet_type: 0(core), packet_id: 0 + aiex.npu.write32 {address = 213216 : ui32, column = 0 : i32, row = 2 : i32, value = 1260527873 : ui32} // events: 0x4B(port0 run) 22(event1) 21(event0) 01(true) + aiex.npu.write32 {address = 213220 : ui32, column = 0 : i32, row = 2 : i32, value = 757865039 : ui32} // events: 0x2D(lock release) 2C(lock acquire) 1A(lock stall) 4F(port1 run) + aiex.npu.write32 {address = 261888 : ui32, column = 0 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 + aiex.npu.write32 {address = 261892 : ui32, column = 0 : i32, row = 2 : i32, value = 0 : ui32} + aiex.npu.writebd_shimtile {bd_id = 12 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.write32 {address = 119308 : ui32, column = 0 : 
i32, row = 0 : i32, value = 12 : ui32} + + aiex.npu.write32 {address = 606208 : ui32, column = 0 : i32, row = 1 : i32, value = 40192 : ui32} // [15:8] reset event: 157(BROADCAST_15) + aiex.npu.write32 {address = 606416 : ui32, column = 0 : i32, row = 1 : i32, value = 10289152 : ui32} // [23:16] start event: 157(BROADCAST_15) + aiex.npu.write32 {address = 606420 : ui32, column = 0 : i32, row = 1 : i32, value = 12292 : ui32} // [14:12] packet_type: 3(mem_tile), [4:0] packet_id: 4 + aiex.npu.write32 {address = 606432 : ui32, column = 0 : i32, row = 1 : i32, value = 760239192 : ui32} // events: 0x2D(lock release) 50(port0 run) 0x54(port1 run) 58(port2 run) + aiex.npu.write32 {address = 606436 : ui32, column = 0 : i32, row = 1 : i32, value = 1549821032 : ui32} // events: 5C(port3 run) 60(port4 run) 64(port5 run) 68(port6 run) + aiex.npu.write32 {address = 724736 : ui32, column = 0 : i32, row = 1 : i32, value = 33620000 : ui32} // [29:24] port3 MM2S-2, [21:16] port2 MM2S-1, [13:8] port1 MM2S-0, [5:0] port0 S2MM-0 + aiex.npu.write32 {address = 724740: ui32, column = 0 : i32, row = 1 : i32, value = 270595 : ui32} // [21:16] port6 MM2S-4, [13:8] port5 S2MM-1, [5:0] port4 MM2S-3 + aiex.npu.writebd_shimtile {bd_id = 10 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 4: i32, packet_type = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 10 : ui32} + + aiex.npu.write32 {address = 212992: ui32, column = 0 : i32, row = 0 : i32, value = 32512 : ui32} // [14:8] reset event: 127(USER_EVENT_1) + aiex.npu.write32 {address = 213068: ui32, column = 0 : i32, row = 0 : i32, value = 127 : ui32} // [6:0] broadcast 15: 127(USER_EVENT_1) + aiex.npu.write32 {address = 213000: ui32, column = 0 : i32, row = 0 : i32, value = 127 : ui32} // event generate [6:0]: 127(USER_EVENT_1) + // + memref.assume_alignment %arg0, 64 : memref<16x16xi32> + memref.assume_alignment %arg1, 64 : memref<16x16xi32> + memref.assume_alignment %arg2, 64 : memref<16x16xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<16x16xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<16x16xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 2 : i64, metadata = @airMemcpyId12} : memref<16x16xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} + return + } + } {sym_name = "segment_0"} +} diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx4.mlir b/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx4.mlir new file mode 100644 index 0000000000..6ffc1cfda2 --- /dev/null +++ b/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx4.mlir @@ -0,0 +1,532 @@ +module { + aie.device(npu1_4col) { + func.func private @matmul_scalar_2x2x2_4x8x4_i32_i32(memref<2x2x4x8xi32, 2 : i32>, memref<2x2x8x4xi32, 2 : i32>, memref<2x2x4x4xi32, 
2 : i32>) + // + func.func private @event_0() + func.func private @event_1() + func.func private @flush_trace() + // + %tile_0_0 = aie.tile(0, 0) + %tile_0_1 = aie.tile(0, 1) + %tile_1_1 = aie.tile(1, 1) + %tile_2_1 = aie.tile(2, 1) + %tile_0_2 = aie.tile(0, 2) + %tile_1_2 = aie.tile(1, 2) + %tile_2_2 = aie.tile(2, 2) + %tile_3_2 = aie.tile(3, 2) + // + %tile_1_0 = aie.tile(1, 0) + %tile_2_0 = aie.tile(2, 0) + %tile_3_0 = aie.tile(3, 0) + // + %lock_2_1 = aie.lock(%tile_2_1, 1) {init = 4 : i32} + %lock_2_1_0 = aie.lock(%tile_2_1, 0) {init = 0 : i32} + %lock_1_1 = aie.lock(%tile_1_1, 1) {init = 2 : i32} + %lock_1_1_1 = aie.lock(%tile_1_1, 0) {init = 0 : i32} + %lock_0_1 = aie.lock(%tile_0_1, 1) {init = 2 : i32} + %lock_0_1_2 = aie.lock(%tile_0_1, 0) {init = 0 : i32} + %lock_0_2 = aie.lock(%tile_0_2, 5) {init = 1 : i32} + %lock_0_2_4 = aie.lock(%tile_0_2, 4) {init = 0 : i32} + %lock_0_2_5 = aie.lock(%tile_0_2, 3) {init = 1 : i32} + %lock_0_2_6 = aie.lock(%tile_0_2, 2) {init = 0 : i32} + %lock_0_2_7 = aie.lock(%tile_0_2, 1) {init = 1 : i32} + %lock_0_2_8 = aie.lock(%tile_0_2, 0) {init = 0 : i32} + %lock_1_2 = aie.lock(%tile_1_2, 5) {init = 1 : i32} + %lock_1_2_9 = aie.lock(%tile_1_2, 4) {init = 0 : i32} + %lock_1_2_10 = aie.lock(%tile_1_2, 3) {init = 1 : i32} + %lock_1_2_11 = aie.lock(%tile_1_2, 2) {init = 0 : i32} + %lock_1_2_12 = aie.lock(%tile_1_2, 1) {init = 1 : i32} + %lock_1_2_13 = aie.lock(%tile_1_2, 0) {init = 0 : i32} + %lock_2_2 = aie.lock(%tile_2_2, 5) {init = 1 : i32} + %lock_2_2_14 = aie.lock(%tile_2_2, 4) {init = 0 : i32} + %lock_2_2_15 = aie.lock(%tile_2_2, 3) {init = 1 : i32} + %lock_2_2_16 = aie.lock(%tile_2_2, 2) {init = 0 : i32} + %lock_2_2_17 = aie.lock(%tile_2_2, 1) {init = 1 : i32} + %lock_2_2_18 = aie.lock(%tile_2_2, 0) {init = 0 : i32} + %lock_3_2 = aie.lock(%tile_3_2, 5) {init = 1 : i32} + %lock_3_2_19 = aie.lock(%tile_3_2, 4) {init = 0 : i32} + %lock_3_2_20 = aie.lock(%tile_3_2, 3) {init = 1 : i32} + %lock_3_2_21 = aie.lock(%tile_3_2, 2) {init = 0 : i32} + %lock_3_2_22 = aie.lock(%tile_3_2, 1) {init = 1 : i32} + %lock_3_2_23 = aie.lock(%tile_3_2, 0) {init = 0 : i32} + %buf14 = aie.buffer(%tile_0_1) {mem_bank = 0 : i32, sym_name = "buf14"} : memref<16x16xi32, 1 : i32> + %buf13 = aie.buffer(%tile_1_1) {mem_bank = 0 : i32, sym_name = "buf13"} : memref<16x16xi32, 1 : i32> + %buf12 = aie.buffer(%tile_2_1) {mem_bank = 0 : i32, sym_name = "buf12"} : memref<16x16xi32, 1 : i32> + %buf11 = aie.buffer(%tile_0_2) {mem_bank = 0 : i32, sym_name = "buf11"} : memref<2x2x4x8xi32, 2 : i32> + %buf10 = aie.buffer(%tile_0_2) {mem_bank = 0 : i32, sym_name = "buf10"} : memref<2x2x8x4xi32, 2 : i32> + %buf9 = aie.buffer(%tile_0_2) {mem_bank = 0 : i32, sym_name = "buf9"} : memref<2x2x4x4xi32, 2 : i32> + %buf8 = aie.buffer(%tile_1_2) {mem_bank = 0 : i32, sym_name = "buf8"} : memref<2x2x4x8xi32, 2 : i32> + %buf7 = aie.buffer(%tile_1_2) {mem_bank = 0 : i32, sym_name = "buf7"} : memref<2x2x8x4xi32, 2 : i32> + %buf6 = aie.buffer(%tile_1_2) {mem_bank = 0 : i32, sym_name = "buf6"} : memref<2x2x4x4xi32, 2 : i32> + %buf5 = aie.buffer(%tile_2_2) {mem_bank = 0 : i32, sym_name = "buf5"} : memref<2x2x4x8xi32, 2 : i32> + %buf4 = aie.buffer(%tile_2_2) {mem_bank = 0 : i32, sym_name = "buf4"} : memref<2x2x8x4xi32, 2 : i32> + %buf3 = aie.buffer(%tile_2_2) {mem_bank = 0 : i32, sym_name = "buf3"} : memref<2x2x4x4xi32, 2 : i32> + %buf2 = aie.buffer(%tile_3_2) {mem_bank = 0 : i32, sym_name = "buf2"} : memref<2x2x4x8xi32, 2 : i32> + %buf1 = aie.buffer(%tile_3_2) {mem_bank = 0 : i32, sym_name = "buf1"} : 
memref<2x2x8x4xi32, 2 : i32> + %buf0 = aie.buffer(%tile_3_2) {mem_bank = 0 : i32, sym_name = "buf0"} : memref<2x2x4x4xi32, 2 : i32> + %mem_0_2 = aie.mem(%tile_0_2) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb5, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_2_5, AcquireGreaterEqual, 1) + aie.dma_bd(%buf11 : memref<2x2x4x8xi32, 2 : i32>, 0, 128) + aie.use_lock(%lock_0_2_6, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb5 + %1 = aie.dma_start(S2MM, 1, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_2, AcquireGreaterEqual, 1) + aie.dma_bd(%buf10 : memref<2x2x8x4xi32, 2 : i32>, 0, 128) + aie.use_lock(%lock_0_2_4, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb0 + %2 = aie.dma_start(MM2S, 0, ^bb6, ^bb3, repeat_count = 1) + ^bb6: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_0_2_8, AcquireGreaterEqual, 1) + aie.dma_bd(%buf9 : memref<2x2x4x4xi32, 2 : i32>, 0, 64, [, , ]) + aie.use_lock(%lock_0_2_7, Release, 1) + aie.next_bd ^bb6 + } + %core_0_2 = aie.core(%tile_0_2) { + %c0_i32 = arith.constant 0 : i32 + %c4 = arith.constant 4 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_2_7, AcquireGreaterEqual, 1) + aie.use_lock(%lock_0_2_6, AcquireGreaterEqual, 1) + aie.use_lock(%lock_0_2_4, AcquireGreaterEqual, 1) + // + func.call @event_0() : () -> () + // + scf.for %arg0 = %c0 to %c2 step %c1 { + scf.for %arg1 = %c0 to %c2 step %c1 { + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c4 step %c1 { + memref.store %c0_i32, %buf9[%arg0, %arg1, %arg2, %arg3] : memref<2x2x4x4xi32, 2 : i32> + } + } + } + } + // + func.call @event_1() : () -> () + // + func.call @matmul_scalar_2x2x2_4x8x4_i32_i32(%buf11, %buf10, %buf9) : (memref<2x2x4x8xi32, 2 : i32>, memref<2x2x8x4xi32, 2 : i32>, memref<2x2x4x4xi32, 2 : i32>) -> () + aie.use_lock(%lock_0_2_8, Release, 1) + aie.use_lock(%lock_0_2_5, Release, 1) + aie.use_lock(%lock_0_2, Release, 1) + // + func.call @flush_trace() : () -> () + // + cf.br ^bb1 + } {elf_file = "segment_0_core_0_2.elf", link_with = "mm.o"} + %mem_1_2 = aie.mem(%tile_1_2) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb5, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_1_2_10, AcquireGreaterEqual, 1) + aie.dma_bd(%buf8 : memref<2x2x4x8xi32, 2 : i32>, 0, 128) + aie.use_lock(%lock_1_2_11, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb5 + %1 = aie.dma_start(S2MM, 1, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_1_2, AcquireGreaterEqual, 1) + aie.dma_bd(%buf7 : memref<2x2x8x4xi32, 2 : i32>, 0, 128) + aie.use_lock(%lock_1_2_9, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb0 + %2 = aie.dma_start(MM2S, 0, ^bb6, ^bb3, repeat_count = 1) + ^bb6: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_1_2_13, AcquireGreaterEqual, 1) + aie.dma_bd(%buf6 : memref<2x2x4x4xi32, 2 : i32>, 0, 64, [, , ]) + aie.use_lock(%lock_1_2_12, Release, 1) + aie.next_bd ^bb6 + } + %core_1_2 = aie.core(%tile_1_2) { + %c0_i32 = arith.constant 0 : i32 + %c4 = arith.constant 4 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_1_2_12, AcquireGreaterEqual, 1) + aie.use_lock(%lock_1_2_11, AcquireGreaterEqual, 1) + aie.use_lock(%lock_1_2_9, 
AcquireGreaterEqual, 1) + // + func.call @event_0() : () -> () + // + scf.for %arg0 = %c0 to %c2 step %c1 { + scf.for %arg1 = %c0 to %c2 step %c1 { + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c4 step %c1 { + memref.store %c0_i32, %buf6[%arg0, %arg1, %arg2, %arg3] : memref<2x2x4x4xi32, 2 : i32> + } + } + } + } + // + func.call @event_1() : () -> () + // + func.call @matmul_scalar_2x2x2_4x8x4_i32_i32(%buf8, %buf7, %buf6) : (memref<2x2x4x8xi32, 2 : i32>, memref<2x2x8x4xi32, 2 : i32>, memref<2x2x4x4xi32, 2 : i32>) -> () + aie.use_lock(%lock_1_2_13, Release, 1) + aie.use_lock(%lock_1_2_10, Release, 1) + aie.use_lock(%lock_1_2, Release, 1) + // + func.call @flush_trace() : () -> () + // + cf.br ^bb1 + } {elf_file = "segment_0_core_1_2.elf", link_with = "mm.o"} + %mem_2_2 = aie.mem(%tile_2_2) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb5, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_2_2_15, AcquireGreaterEqual, 1) + aie.dma_bd(%buf5 : memref<2x2x4x8xi32, 2 : i32>, 0, 128) + aie.use_lock(%lock_2_2_16, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb5 + %1 = aie.dma_start(S2MM, 1, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_2_2, AcquireGreaterEqual, 1) + aie.dma_bd(%buf4 : memref<2x2x8x4xi32, 2 : i32>, 0, 128) + aie.use_lock(%lock_2_2_14, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb0 + %2 = aie.dma_start(MM2S, 0, ^bb6, ^bb3, repeat_count = 1) + ^bb6: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_2_2_18, AcquireGreaterEqual, 1) + aie.dma_bd(%buf3 : memref<2x2x4x4xi32, 2 : i32>, 0, 64, [, , ]) + aie.use_lock(%lock_2_2_17, Release, 1) + aie.next_bd ^bb6 + } + %core_2_2 = aie.core(%tile_2_2) { + %c0_i32 = arith.constant 0 : i32 + %c4 = arith.constant 4 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_2_2_17, AcquireGreaterEqual, 1) + aie.use_lock(%lock_2_2_16, AcquireGreaterEqual, 1) + aie.use_lock(%lock_2_2_14, AcquireGreaterEqual, 1) + // + func.call @event_0() : () -> () + // + scf.for %arg0 = %c0 to %c2 step %c1 { + scf.for %arg1 = %c0 to %c2 step %c1 { + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c4 step %c1 { + memref.store %c0_i32, %buf3[%arg0, %arg1, %arg2, %arg3] : memref<2x2x4x4xi32, 2 : i32> + } + } + } + } + // + func.call @event_1() : () -> () + // + func.call @matmul_scalar_2x2x2_4x8x4_i32_i32(%buf5, %buf4, %buf3) : (memref<2x2x4x8xi32, 2 : i32>, memref<2x2x8x4xi32, 2 : i32>, memref<2x2x4x4xi32, 2 : i32>) -> () + aie.use_lock(%lock_2_2_18, Release, 1) + aie.use_lock(%lock_2_2_15, Release, 1) + aie.use_lock(%lock_2_2, Release, 1) + // + func.call @flush_trace() : () -> () + // + cf.br ^bb1 + } {elf_file = "segment_0_core_2_2.elf", link_with = "mm.o"} + %mem_3_2 = aie.mem(%tile_3_2) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb5, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_3_2_20, AcquireGreaterEqual, 1) + aie.dma_bd(%buf2 : memref<2x2x4x8xi32, 2 : i32>, 0, 128) + aie.use_lock(%lock_3_2_21, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb5 + %1 = aie.dma_start(S2MM, 1, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_3_2, AcquireGreaterEqual, 1) + aie.dma_bd(%buf1 : memref<2x2x8x4xi32, 2 : i32>, 0, 128) + aie.use_lock(%lock_3_2_19, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb0 + %2 = 
aie.dma_start(MM2S, 0, ^bb6, ^bb3, repeat_count = 1) + ^bb6: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_3_2_23, AcquireGreaterEqual, 1) + aie.dma_bd(%buf0 : memref<2x2x4x4xi32, 2 : i32>, 0, 64, [, , ]) + aie.use_lock(%lock_3_2_22, Release, 1) + aie.next_bd ^bb6 + } + %core_3_2 = aie.core(%tile_3_2) { + %c0_i32 = arith.constant 0 : i32 + %c4 = arith.constant 4 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_3_2_22, AcquireGreaterEqual, 1) + aie.use_lock(%lock_3_2_21, AcquireGreaterEqual, 1) + aie.use_lock(%lock_3_2_19, AcquireGreaterEqual, 1) + // + func.call @event_0() : () -> () + // + scf.for %arg0 = %c0 to %c2 step %c1 { + scf.for %arg1 = %c0 to %c2 step %c1 { + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c4 step %c1 { + memref.store %c0_i32, %buf0[%arg0, %arg1, %arg2, %arg3] : memref<2x2x4x4xi32, 2 : i32> + } + } + } + } + // + func.call @event_1() : () -> () + // + func.call @matmul_scalar_2x2x2_4x8x4_i32_i32(%buf2, %buf1, %buf0) : (memref<2x2x4x8xi32, 2 : i32>, memref<2x2x8x4xi32, 2 : i32>, memref<2x2x4x4xi32, 2 : i32>) -> () + aie.use_lock(%lock_3_2_23, Release, 1) + aie.use_lock(%lock_3_2_20, Release, 1) + aie.use_lock(%lock_3_2, Release, 1) + // + func.call @flush_trace() : () -> () + // + cf.br ^bb1 + } {elf_file = "segment_0_core_3_2.elf", link_with = "mm.o"} + aie.flow(%tile_0_0, DMA : 0, %tile_0_1, DMA : 0) + aie.flow(%tile_0_0, DMA : 1, %tile_1_1, DMA : 0) + aie.flow(%tile_0_1, DMA : 0, %tile_0_2, DMA : 0) + aie.flow(%tile_0_1, DMA : 0, %tile_1_2, DMA : 0) + aie.flow(%tile_0_1, DMA : 1, %tile_2_2, DMA : 0) + aie.flow(%tile_0_1, DMA : 1, %tile_3_2, DMA : 0) + aie.flow(%tile_1_1, DMA : 0, %tile_0_2, DMA : 1) + aie.flow(%tile_1_1, DMA : 0, %tile_2_2, DMA : 1) + aie.flow(%tile_1_1, DMA : 1, %tile_1_2, DMA : 1) + aie.flow(%tile_1_1, DMA : 1, %tile_3_2, DMA : 1) + aie.flow(%tile_0_2, DMA : 0, %tile_2_1, DMA : 0) + aie.flow(%tile_1_2, DMA : 0, %tile_2_1, DMA : 1) + aie.flow(%tile_2_2, DMA : 0, %tile_2_1, DMA : 2) + aie.flow(%tile_3_2, DMA : 0, %tile_2_1, DMA : 3) + aie.flow(%tile_2_1, DMA : 0, %tile_0_0, DMA : 0) + // + aie.packet_flow(0) { + aie.packet_source<%tile_0_2, Trace : 0> + aie.packet_dest<%tile_0_0, DMA : 1> + } {keep_pkt_header = true} + aie.packet_flow(1) { + aie.packet_source<%tile_1_2, Trace : 0> + aie.packet_dest<%tile_1_0, DMA : 1> + } {keep_pkt_header = true} + aie.packet_flow(2) { + aie.packet_source<%tile_2_2, Trace : 0> + aie.packet_dest<%tile_2_0, DMA : 1> + } {keep_pkt_header = true} + aie.packet_flow(3) { + aie.packet_source<%tile_3_2, Trace : 0> + aie.packet_dest<%tile_3_0, DMA : 1> + } {keep_pkt_header = true} + aie.packet_flow(4) { + aie.packet_source<%tile_0_1, Trace : 0> + aie.packet_dest<%tile_0_0, DMA : 1> + } {keep_pkt_header = true} + aie.packet_flow(5) { + aie.packet_source<%tile_1_1, Trace : 0> + aie.packet_dest<%tile_1_0, DMA : 1> + } {keep_pkt_header = true} + aie.packet_flow(6) { + aie.packet_source<%tile_2_1, Trace : 0> + aie.packet_dest<%tile_2_0, DMA : 1> + } {keep_pkt_header = true} + // + %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb5, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_1, AcquireGreaterEqual, 2) + aie.dma_bd(%buf14 : memref<16x16xi32, 1 : i32>, 0, 256) + aie.use_lock(%lock_0_1_2, Release, 2) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb5 + %1 = 
aie.dma_start(MM2S, 0, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_1_2, AcquireGreaterEqual, 1) + aie.dma_bd(%buf14 : memref<16x16xi32, 1 : i32>, 0, 128, [, , ]) + aie.use_lock(%lock_0_1, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb0 + %2 = aie.dma_start(MM2S, 1, ^bb6, ^bb3, repeat_count = 1) + ^bb6: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_0_1_2, AcquireGreaterEqual, 1) + aie.dma_bd(%buf14 : memref<16x16xi32, 1 : i32>, 128, 128, [, , ]) + aie.use_lock(%lock_0_1, Release, 1) + aie.next_bd ^bb6 + + } + %memtile_dma_1_1 = aie.memtile_dma(%tile_1_1) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb5, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_1_1, AcquireGreaterEqual, 2) + aie.dma_bd(%buf13 : memref<16x16xi32, 1 : i32>, 0, 256) + aie.use_lock(%lock_1_1_1, Release, 2) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb5 + %1 = aie.dma_start(MM2S, 0, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_1_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf13 : memref<16x16xi32, 1 : i32>, 0, 128, [, , ]) + aie.use_lock(%lock_1_1, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb0 + %2 = aie.dma_start(MM2S, 1, ^bb6, ^bb3, repeat_count = 1) + ^bb6: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_1_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf13 : memref<16x16xi32, 1 : i32>, 8, 128, [, , ]) + aie.use_lock(%lock_1_1, Release, 1) + aie.next_bd ^bb6 + } + %memtile_dma_2_1 = aie.memtile_dma(%tile_2_1) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb9, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_2_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf12 : memref<16x16xi32, 1 : i32>, 0, 64, [, ]) + aie.use_lock(%lock_2_1_0, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb5 + %1 = aie.dma_start(S2MM, 1, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_2_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf12 : memref<16x16xi32, 1 : i32>, 8, 64, [, ]) + aie.use_lock(%lock_2_1_0, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb7 + %2 = aie.dma_start(S2MM, 2, ^bb6, ^bb3, repeat_count = 1) + ^bb6: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_2_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf12 : memref<16x16xi32, 1 : i32>, 128, 64, [, ]) + aie.use_lock(%lock_2_1_0, Release, 1) + aie.next_bd ^bb6 + ^bb7: // pred: ^bb9 + %3 = aie.dma_start(S2MM, 3, ^bb8, ^bb6, repeat_count = 1) + ^bb8: // 2 preds: ^bb7, ^bb8 + aie.use_lock(%lock_2_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf12 : memref<16x16xi32, 1 : i32>, 136, 64, [, ]) + aie.use_lock(%lock_2_1_0, Release, 1) + aie.next_bd ^bb8 + ^bb9: // pred: ^bb0 + %4 = aie.dma_start(MM2S, 0, ^bb10, ^bb7, repeat_count = 1) + ^bb10: // 2 preds: ^bb9, ^bb10 + aie.use_lock(%lock_2_1_0, AcquireGreaterEqual, 4) + aie.dma_bd(%buf12 : memref<16x16xi32, 1 : i32>, 0, 256) + aie.use_lock(%lock_2_1, Release, 4) + aie.next_bd ^bb10 + } + aie.shim_dma_allocation @airMemcpyId12(S2MM, 0, 0) + memref.global "public" @airMemcpyId12 : memref<16x16xi32, 1 : i32> + aie.shim_dma_allocation @airMemcpyId4(MM2S, 0, 0) + memref.global "public" @airMemcpyId4 : memref<16x16xi32, 1 : i32> + aie.shim_dma_allocation @airMemcpyId5(MM2S, 1, 0) + memref.global "public" @airMemcpyId5 : memref<16x16xi32, 1 : i32> + func.func @matmul_16x16_16xi32__dispatch_0_matmul_16x16x16_i32(%arg0: memref<16x16xi32>, %arg1: memref<16x16xi32>, %arg2: memref<16x16xi32>) { + // + aiex.npu.write32 {address = 212992 : ui32, column = 3 : i32, row = 2 : 
i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213200 : ui32, column = 3 : i32, row = 2 : i32, value = 7995392 : ui32} // [22:16] start event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213204 : ui32, column = 3 : i32, row = 2 : i32, value = 3 : ui32} // packet_type: 0(core), packet_id: 3 + aiex.npu.write32 {address = 213216 : ui32, column = 3 : i32, row = 2 : i32, value = 1260527873 : ui32} // events: 0x4B(port0 run) 22(event1) 21(event0) 01(true) + aiex.npu.write32 {address = 213220 : ui32, column = 3 : i32, row = 2 : i32, value = 757865039 : ui32} // events: 0x2D(lock release) 2C(lock acquire) 1A(lock stall) 4F(port1 run) + aiex.npu.write32 {address = 261888 : ui32, column = 3 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 + aiex.npu.write32 {address = 261892 : ui32, column = 3 : i32, row = 2 : i32, value = 0 : ui32} + aiex.npu.writebd_shimtile {bd_id = 15 : i32, buffer_length = 8192 : i32, buffer_offset = 25600 : i32, column = 3 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 3: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.write32 {address = 119308 : ui32, column = 3 : i32, row = 0 : i32, value = 15 : ui32} + + aiex.npu.write32 {address = 212992 : ui32, column = 2 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213200 : ui32, column = 2 : i32, row = 2 : i32, value = 7995392 : ui32} // [22:16] start event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213204 : ui32, column = 2 : i32, row = 2 : i32, value = 2 : ui32} // packet_type: 0(core), packet_id: 2 + aiex.npu.write32 {address = 213216 : ui32, column = 2 : i32, row = 2 : i32, value = 1260527873 : ui32} // events: 0x4B(port0 run) 22(event1) 21(event0) 01(true) + aiex.npu.write32 {address = 213220 : ui32, column = 2 : i32, row = 2 : i32, value = 757865039 : ui32} // events: 0x2D(lock release) 2C(lock acquire) 1A(lock stall) 4F(port1 run) + aiex.npu.write32 {address = 261888 : ui32, column = 2 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 + aiex.npu.write32 {address = 261892 : ui32, column = 2 : i32, row = 2 : i32, value = 0 : ui32} + aiex.npu.writebd_shimtile {bd_id = 14 : i32, buffer_length = 8192 : i32, buffer_offset = 17408 : i32, column = 2 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 2: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.write32 {address = 119308 : ui32, column = 2 : i32, row = 0 : i32, value = 14 : ui32} + + aiex.npu.write32 {address = 212992 : ui32, column = 1 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) + aiex.npu.write32 {address = 
213200 : ui32, column = 1 : i32, row = 2 : i32, value = 7995392 : ui32} // [22:16] start event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213204 : ui32, column = 1 : i32, row = 2 : i32, value = 1 : ui32} // packet_type: 0(core), packet_id: 1 + aiex.npu.write32 {address = 213216 : ui32, column = 1 : i32, row = 2 : i32, value = 1260527873 : ui32} // events: 0x4B(port0 run) 22(event1) 21(event0) 01(true) + aiex.npu.write32 {address = 213220 : ui32, column = 1 : i32, row = 2 : i32, value = 757865039 : ui32} // events: 0x2D(lock release) 2C(lock acquire) 1A(lock stall) 4F(port1 run) + aiex.npu.write32 {address = 261888 : ui32, column = 1 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 + aiex.npu.write32 {address = 261892 : ui32, column = 1 : i32, row = 2 : i32, value = 0 : ui32} + aiex.npu.writebd_shimtile {bd_id = 13 : i32, buffer_length = 8192 : i32, buffer_offset = 9216 : i32, column = 1 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 1: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.write32 {address = 119308 : ui32, column = 1 : i32, row = 0 : i32, value = 13 : ui32} + + aiex.npu.write32 {address = 212992 : ui32, column = 0 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213200 : ui32, column = 0 : i32, row = 2 : i32, value = 7995392 : ui32} // [22:16] start event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213204 : ui32, column = 0 : i32, row = 2 : i32, value = 0 : ui32} // packet_type: 0(core), packet_id: 0 + aiex.npu.write32 {address = 213216 : ui32, column = 0 : i32, row = 2 : i32, value = 1260527873 : ui32} // events: 0x4B(port0 run) 22(event1) 21(event0) 01(true) + aiex.npu.write32 {address = 213220 : ui32, column = 0 : i32, row = 2 : i32, value = 757865039 : ui32} // events: 0x2D(lock release) 2C(lock acquire) 1A(lock stall) 4F(port1 run) + aiex.npu.write32 {address = 261888 : ui32, column = 0 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 + aiex.npu.write32 {address = 261892 : ui32, column = 0 : i32, row = 2 : i32, value = 0 : ui32} + aiex.npu.writebd_shimtile {bd_id = 12 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 12 : ui32} + + aiex.npu.write32 {address = 606208 : ui32, column = 2 : i32, row = 1 : i32, value = 40192 : ui32} // [15:8] reset event: 157(BROADCAST_15) + aiex.npu.write32 {address = 606416 : ui32, column = 2 : i32, row = 1 : i32, value = 10289152 : ui32} // [23:16] start event: 
157(BROADCAST_15) + aiex.npu.write32 {address = 606420 : ui32, column = 2 : i32, row = 1 : i32, value = 12294 : ui32} // [14:12] packet_type: 3(mem_tile), [4:0] packet_id: 6 + aiex.npu.write32 {address = 606432 : ui32, column = 2 : i32, row = 1 : i32, value = 336 : ui32} // events: 0x00 00 01(true) 50(port0 run) + aiex.npu.write32 {address = 606436 : ui32, column = 2 : i32, row = 1 : i32, value = 1415076960 : ui32} // events: 0x54(port1 run) 58(port2 run) 5C(port3 run) 60(port4 run) + aiex.npu.write32 {address = 724736 : ui32, column = 2 : i32, row = 1 : i32, value = 589439264 : ui32} // [29:24] port3 S2MM-3, [21:16] port2 S2MM-2, [13:8] port1 S2MM-1, [5:0] port0 S2MM-0 + aiex.npu.write32 {address = 724740: ui32, column = 2 : i32, row = 1 : i32, value = 0 : ui32} // [5:0] port4 MM2S-0 + aiex.npu.writebd_shimtile {bd_id = 11 : i32, buffer_length = 8192 : i32, buffer_offset = 17408 : i32, column = 2 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 6: i32, packet_type = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.write32 {address = 119308 : ui32, column = 2 : i32, row = 0 : i32, value = 11 : ui32} + + aiex.npu.write32 {address = 606208 : ui32, column = 1 : i32, row = 1 : i32, value = 40192 : ui32} // [15:8] reset event: 157(BROADCAST_15) + aiex.npu.write32 {address = 606416 : ui32, column = 1 : i32, row = 1 : i32, value = 10289152 : ui32} // [23:16] start event: 157(BROADCAST_15) + aiex.npu.write32 {address = 606420 : ui32, column = 1 : i32, row = 1 : i32, value = 12293 : ui32} // [14:12] packet_type: 3(mem_tile), [4:0] packet_id: 5 + aiex.npu.write32 {address = 606432 : ui32, column = 1 : i32, row = 1 : i32, value = 336 : ui32} // events: 0x00 00 01(true) 50(port0 run) + aiex.npu.write32 {address = 606436 : ui32, column = 1 : i32, row = 1 : i32, value = 1415076960 : ui32} // events: 0x54(port1 run) 58(port2 run) 5C(port3 run) 60(port4 run) + aiex.npu.write32 {address = 724736 : ui32, column = 1 : i32, row = 1 : i32, value = 33620000 : ui32} // [29:24] port3 MM2S-2, [21:16] port2 MM2S-1, [13:8] port1 MM2S-0, [5:0] port0 S2MM-0 + aiex.npu.write32 {address = 724740: ui32, column = 1 : i32, row = 1 : i32, value = 3 : ui32} // [5:0] port4 MM2S-3 + aiex.npu.writebd_shimtile {bd_id = 10 : i32, buffer_length = 8192 : i32, buffer_offset = 9216 : i32, column = 1 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 5: i32, packet_type = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.write32 {address = 119308 : ui32, column = 1 : i32, row = 0 : i32, value = 10 : ui32} + + aiex.npu.write32 {address = 606208 : ui32, column = 0 : i32, row = 1 : i32, value = 40192 : ui32} // [15:8] reset event: 157(BROADCAST_15) + aiex.npu.write32 {address = 606416 : ui32, column = 0 : i32, row = 1 : i32, value = 10289152 : ui32} // 
[23:16] start event: 157(BROADCAST_15) + aiex.npu.write32 {address = 606420 : ui32, column = 0 : i32, row = 1 : i32, value = 12292 : ui32} // [14:12] packet_type: 3(mem_tile), [4:0] packet_id: 4 + aiex.npu.write32 {address = 606432 : ui32, column = 0 : i32, row = 1 : i32, value = 760239192 : ui32} // events: 0x2D(lock release) 50(port0 run) 0x54(port1 run) 58(port2 run) + aiex.npu.write32 {address = 606436 : ui32, column = 0 : i32, row = 1 : i32, value = 1549821032 : ui32} // events: 5C(port3 run) 60(port4 run) 64(port5 run) 68(port6 run) + aiex.npu.write32 {address = 724736 : ui32, column = 0 : i32, row = 1 : i32, value = 33620000 : ui32} // [29:24] port3 MM2S-2, [21:16] port2 MM2S-1, [13:8] port1 MM2S-0, [5:0] port0 S2MM-0 + aiex.npu.write32 {address = 724740: ui32, column = 0 : i32, row = 1 : i32, value = 270595 : ui32} // [21:16] port6 MM2S-4, [13:8] port5 S2MM-1, [5:0] port4 MM2S-3 + aiex.npu.writebd_shimtile {bd_id = 9 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 4: i32, packet_type = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 9 : ui32} + + aiex.npu.write32 {address = 212992: ui32, column = 0 : i32, row = 0 : i32, value = 32512 : ui32} // [14:8] reset event: 127(USER_EVENT_1) + aiex.npu.write32 {address = 213068: ui32, column = 0 : i32, row = 0 : i32, value = 127 : ui32} // [6:0] broadcast 15: 127(USER_EVENT_1) + aiex.npu.write32 {address = 213000: ui32, column = 0 : i32, row = 0 : i32, value = 127 : ui32} // event generate [6:0]: 127(USER_EVENT_1) + + // + memref.assume_alignment %arg0, 64 : memref<16x16xi32> + memref.assume_alignment %arg1, 64 : memref<16x16xi32> + memref.assume_alignment %arg2, 64 : memref<16x16xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<16x16xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<16x16xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 2 : i64, metadata = @airMemcpyId12} : memref<16x16xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} + return + } + } {sym_name = "segment_0"} +} diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/matrix_multiplication.h b/test/npu-xrt/matrix_multiplication_using_cascade/matrix_multiplication.h new file mode 100644 index 0000000000..9dbe9bd203 --- /dev/null +++ b/test/npu-xrt/matrix_multiplication_using_cascade/matrix_multiplication.h @@ -0,0 +1,316 @@ +//===- matrix_multiplication.h ----------------------------000---*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2024, Advanced Micro Devices, Inc. 
+// +//===----------------------------------------------------------------------===// + +// This file contains common helper functions for the matrix multiplication +// host code, such as verifying and printing matrices. + +#ifndef MATRIX_MULTIPLICATION_H +#define MATRIX_MULTIPLICATION_H + +#include +#include + +namespace matmul_common { + +namespace po = boost::program_options; + +// -------------------------------------------------------------------------- +// Command Line Argument Handling +// -------------------------------------------------------------------------- + +void check_arg_file_exists(po::variables_map &vm_in, std::string name) { + if (!vm_in.count(name)) { + throw std::runtime_error("Error: no " + name + " file was provided\n"); + } else { + std::ifstream test(vm_in[name].as()); + if (!test) { + throw std::runtime_error("The " + name + " file " + + vm_in[name].as() + + " does not exist.\n"); + } + } +} + +void add_default_options(po::options_description &desc) { + desc.add_options()("help,h", "produce help message")( + "xclbin,x", po::value()->required(), + "the input xclbin path")( + "kernel,k", po::value()->required(), + "the kernel name in the XCLBIN (for instance PP_PRE_FD)")( + "verbosity,v", po::value()->default_value(0), + "the verbosity of the output")( + "instr,i", po::value()->required(), + "path of file containing userspace instructions sent to the NPU")( + "verify", po::value()->default_value(true), + "whether to verify the AIE computed output")( + "iters", po::value()->default_value(1))( + "warmup", po::value()->default_value(0))( + "trace_sz,t", po::value()->default_value(0))( + "trace_file", po::value()->default_value("trace.txt"), + "where to store trace output"); +} + +void parse_options(int argc, const char *argv[], po::options_description &desc, + po::variables_map &vm) { + try { + po::store(po::parse_command_line(argc, argv, desc), vm); + po::notify(vm); + + if (vm.count("help")) { + std::cout << desc << "\n"; + std::exit(1); + } + } catch (const std::exception &ex) { + std::cerr << ex.what() << "\n\n"; + std::cerr << "Usage:\n" << desc << "\n"; + std::exit(1); + } + + check_arg_file_exists(vm, "xclbin"); + check_arg_file_exists(vm, "instr"); +} + +// -------------------------------------------------------------------------- +// AIE Specifics +// -------------------------------------------------------------------------- + +std::vector load_instr_sequence(std::string instr_path) { + std::ifstream instr_file(instr_path); + std::string line; + std::vector instr_v; + while (std::getline(instr_file, line)) { + std::istringstream iss(line); + uint32_t a; + if (!(iss >> std::hex >> a)) { + throw std::runtime_error("Unable to parse instruction file\n"); + } + instr_v.push_back(a); + } + return instr_v; +} + +// -------------------------------------------------------------------------- +// Matrix / Float / Math +// -------------------------------------------------------------------------- + +static inline std::int16_t random_int16_t() { + return (std::int16_t)rand() % 0x10000; +} + +static inline std::int32_t random_int32_t() { + return (std::int32_t)rand() % 0x10000; +} + +static inline std::bfloat16_t random_bfloat16_t() { + // Random numbers should NOT be uniformly between 0 and 1, because that + // would make the matrix product AB always close to 1. 
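+  // Scaling rand()/RAND_MAX by 4.0 spreads the entries over [0, 4), so
+  // element-wise products fall in [0, 16) and the accumulated dot products
+  // vary enough to be meaningful while staying well within range for the
+  // small matrices used in this test.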
+ return std::bfloat16_t(4.0 * (float)rand() / (float)(RAND_MAX)); +} + +template +void matmul_naive(int M, int N, int K, const std::vector A, + const std::vector B, std::vector &C) { + for (int row = 0; row < M; row++) { + for (int col = 0; col < N; col++) { + Tout running_sum = 0; + for (int k = 0; k < K; k++) { + running_sum += Tout(A[row * K + k] * B[k * N + col]); + } + C[row * N + col] = Tout(running_sum); + } + } +} + +const int K_block_size = 64; + +template +void matmul(int M, int N, int K, const std::vector A, + const std::vector B, std::vector &C) { + // A is an MxK matrix + // B is a KxN matrix + // C is the MxN output matrix, assumed to be zeroed out + + const int n_K_blocks = K / K_block_size; + assert(K % K_block_size == 0 && "K must be divisible by K_block_size"); + + const Tin *B_origin = B.data(); /* Avoid a calls to B.data() within the loop + with this const variable. B does not get + resized, so the pointer remains valid. */ + + const Tin *A_base = A.data(); /* Points to start of current row of A, + monotonically increasing by K. */ + const Tin *B_base = B_origin; /* Points to start of current column of B; + increases by 1 in each inner loop, resets + to B_origin (0) at the start of a new row + (outer loop). */ + + const Tin *A_ptr = A_base; + const Tin *B_ptr = B_base; + Tout *C_ptr = C.data(); /* Monotonically increasing by 1. */ + + for (int row = 0; row < M; row++) { + for (int col = 0; col < N; col++) { + A_ptr = A_base; + B_ptr = B_base; + Tout running_sum = 0; + for (int k = 0; k < n_K_blocks; k++) { + for (int i = 0; i < K_block_size; i++) { + running_sum += Tout(*A_ptr) * Tout(*B_ptr); + A_ptr += 1; // Advance to right neighbor; next value in this row + B_ptr += N; // Advance to bottom neighbor; next value in this column + } + } + *C_ptr = Tout(running_sum); + C_ptr += 1; + B_base += 1; /* Next iteration: same row of A (A_base unchanged), + next column of B (B_base increases by 1) */ + } + A_base += K; // Advance to next row of A + B_base = B_origin; /* Next row of A means we need to restart at the first + column of B. */ + } +} + +// nearly_equal function adapted from Stack Overflow, License CC BY-SA 4.0 +// Original author: P-Gn +// Source: https://stackoverflow.com/a/32334103 +bool nearly_equal(float a, float b, float epsilon = 128 * FLT_EPSILON, + float abs_th = FLT_MIN) +// those defaults are arbitrary and could be removed +{ + assert(std::numeric_limits::epsilon() <= epsilon); + assert(epsilon < 1.f); + + if (a == b) + return true; + + auto diff = std::abs(a - b); + auto norm = + std::min((std::abs(a) + std::abs(b)), std::numeric_limits::max()); + // or even faster: std::min(std::abs(a + b), + // std::numeric_limits::max()); keeping this commented out until I + // update figures below + return diff < std::max(abs_th, epsilon * norm); +} + +template +void print_matrix(const std::vector matrix, int n_cols, + int n_printable_rows = 10, int n_printable_cols = 10, + std::ostream &ostream = std::cout, + const char col_sep[] = " ", const char elide_sym[] = " ... 
", + int w = -1) { + assert(matrix.size() % n_cols == 0); + + auto maxima = std::minmax_element(matrix.begin(), matrix.end()); + T max_val = std::max(*maxima.first, std::abs(*maxima.second)); + size_t n_digits = log10(max_val); + if (w == -1) { + w = n_digits; + } + int n_rows = matrix.size() / n_cols; + + n_printable_rows = std::min(n_rows, n_printable_rows); + n_printable_cols = std::min(n_cols, n_printable_cols); + + const bool elide_rows = n_printable_rows < n_rows; + const bool elide_cols = n_printable_cols < n_cols; + + if (elide_rows || elide_cols) { + w = std::max((int)w, (int)strlen(elide_sym)); + } + + w += 3; // for decimal point and two decimal digits + ostream << std::fixed << std::setprecision(2); + +#define print_row(what) \ + for (int col = 0; col < n_printable_cols / 2; col++) { \ + ostream << std::right << std::setw(w) << (what); \ + ostream << std::setw(0) << col_sep; \ + } \ + if (elide_cols) { \ + ostream << std::setw(0) << elide_sym; \ + } \ + for (int col = n_printable_cols / 2 + 1; col < n_printable_cols; col++) { \ + ostream << std::right << std::setw(w) << (what); \ + ostream << std::setw(0) << col_sep; \ + } + + for (int row = 0; row < n_printable_rows / 2; row++) { + print_row(matrix[row * n_rows + col]); + ostream << std::endl; + } + if (elide_rows) { + print_row(elide_sym); + ostream << std::endl; + } + for (int row = n_printable_rows / 2 + 1; row < n_printable_rows; row++) { + print_row(matrix[row * n_rows + col]); + ostream << std::endl; + } + +#undef print_row +} + +template +int verify(int M, int N, int K, std::vector A, std::vector B, + std::vector C) { + int errors = 0; + int max_printable_errors = 500; + const float absTol = 0.5; + const float relTol = 0.5; + + std::vector CRef(M * N); + if (K % K_block_size == 0) { + matmul(M, N, K, A, B, CRef); + } else { + matmul_naive(M, N, K, A, B, CRef); + } + + for (int row = 0; row < M; row++) { + for (int col = 0; col < N; col++) { + if (!nearly_equal(CRef[row * N + col], C[row * N + col], relTol, + absTol)) { + errors++; + if (errors < max_printable_errors) { + std::cout << "Error in row " << row << ", col " << col << ". " + << "Expected " << std::setw(4) << (float)CRef[row * N + col] + << ", got " << std::setw(4) << (float)C[row * N + col] + << "." << std::endl; + } + } + } + } + + if (errors >= max_printable_errors) { + std::cout << "...and " << std::setw(0) << errors << " further errors." + << std::endl; + } + if (errors > 0) { + std::cout << std::endl << "Reference:" << std::endl; + matmul_common::print_matrix(CRef, N); + std::cout << std::endl << "Output:" << std::endl; + matmul_common::print_matrix(C, N); + } + + return errors; +} + +void write_out_trace(char *traceOutPtr, size_t trace_size, std::string path) { + std::ofstream fout(path); + uint32_t *traceOut = (uint32_t *)traceOutPtr; + for (int i = 0; i < trace_size / sizeof(traceOut[0]); i++) { + fout << std::setfill('0') << std::setw(8) << std::hex << (int)traceOut[i]; + fout << std::endl; + } +} + +} // namespace matmul_common + +#endif diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/mm.cc b/test/npu-xrt/matrix_multiplication_using_cascade/mm.cc new file mode 100644 index 0000000000..8a19bcc250 --- /dev/null +++ b/test/npu-xrt/matrix_multiplication_using_cascade/mm.cc @@ -0,0 +1,89 @@ +//===- mm.cc ----------------------------------------------000---*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#define __AIENGINE__ 2
+#define NOCPP
+#define __AIEARCH__ 20
+
+#include 
+#include 
+#include 
+
+template 
+void matmul_scalar_cascade_i32_i32(int32_t *a, int32_t *b, int32_t *c) {
+  for (int m_t = 0; m_t < M_tile; m_t++) {
+    for (int n_t = 0; n_t < N_tile; n_t++) {
+      for (int k_t = 0; k_t < K_tile; k_t++) {
+        event0();
+        int a_offset = (k_t * M_tile + m_t) * (M * K);
+        int b_offset = (n_t * K_tile + k_t) * (K * N);
+        int c_offset = (n_t * M_tile + m_t) * (M * N);
+        for (int m = 0; m < M; m++) {
+          for (int n = 0; n < N; n++) {
+            int32_t running_sum = 0;
+            if (get && k_t == 0) {
+              v32int32 v32 = get_scd_v32int32();
+              running_sum += ext_elem(v32, 0);
+            }
+            for (int k = 0; k < K; k++) {
+              running_sum += a[a_offset + m * K + k] * b[b_offset + k * N + n];
+            }
+            c[c_offset + m * N + n] += running_sum;
+            if (put && k_t == K_tile - 1) {
+              v32int32 v32 = undef_v32int32();
+              v32 = upd_elem(v32, 0, c[c_offset + m * N + n]);
+              put_mcd(v32);
+            }
+          }
+        }
+        event1();
+      }
+    }
+  }
+}
+
+extern "C" {
+
+void matmul_scalar_put_4x1x4_4x8x4_i32_i32(int32_t *a, int32_t *b, int32_t *c) {
+  matmul_scalar_cascade_i32_i32(a, b, c);
+}
+void matmul_scalar_get_4x1x4_4x8x4_i32_i32(int32_t *a, int32_t *b, int32_t *c) {
+  matmul_scalar_cascade_i32_i32(a, b, c);
+}
+void matmul_scalar_put_4x1x4_4x4x4_i32_i32(int32_t *a, int32_t *b, int32_t *c) {
+  matmul_scalar_cascade_i32_i32(a, b, c);
+}
+void matmul_scalar_get_4x1x4_4x4x4_i32_i32(int32_t *a, int32_t *b, int32_t *c) {
+  matmul_scalar_cascade_i32_i32(a, b, c);
+}
+void matmul_scalar_put_get_4x1x4_4x4x4_i32_i32(int32_t *a, int32_t *b,
+                                               int32_t *c) {
+  matmul_scalar_cascade_i32_i32(a, b, c);
+}
+void matmul_scalar_4x2x4_4x8x4_i32_i32(int32_t *a, int32_t *b, int32_t *c) {
+  matmul_scalar_cascade_i32_i32(a, b, c);
+}
+void matmul_scalar_2x2x2_4x8x4_i32_i32(int32_t *a, int32_t *b, int32_t *c) {
+  matmul_scalar_cascade_i32_i32(a, b, c);
+}
+
+void flush_trace() {
+  // event buffers only appear to be transferred to DDR in bursts of 256 bytes
+  // (64 events)
+  for (int i = 0; i < 32; i++) {
+    event0();
+    event1();
+  }
+}
+
+void event_0() { event0(); }
+void event_1() { event1(); }
+}
diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/run.lit b/test/npu-xrt/matrix_multiplication_using_cascade/run.lit
new file mode 100644
index 0000000000..03d7c862c5
--- /dev/null
+++ b/test/npu-xrt/matrix_multiplication_using_cascade/run.lit
@@ -0,0 +1,19 @@
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai, chess
+//
+// RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/mm.cc -o ./mm.o
+// RUN: g++-13 %S/test.cpp -o test.exe -std=c++23 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
+//
+// RUN: %python aiecc.py --xchesscc --xbridge --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie2_plain.xclbin --npu-insts-name=insts2_plain.txt %S/aie_plainx4.mlir
+// RUN: %run_on_npu ./test.exe -x aie2_plain.xclbin -k MLIR_AIE -i insts2_plain.txt --trace_sz 32768 | FileCheck --check-prefix=CHECK_PLAIN %s
+// CHECK_PLAIN: PASS!
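+//
+// All three designs exercised by this test compute the same matmul and are
+// checked by the same host code; judging by their names and the kernels in
+// mm.cc, aie_plainx4.mlir avoids the cascade, aie_bufferx4.mlir exchanges
+// partial sums through buffers, and aie_cascadex4.mlir chains cores over the
+// accumulator cascade (put_mcd/get_scd).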
+// +// RUN: %python aiecc.py --xchesscc --xbridge --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie2_buffer.xclbin --npu-insts-name=insts2_buffer.txt %S/aie_bufferx4.mlir +// RUN: %run_on_npu ./test.exe -x aie2_buffer.xclbin -k MLIR_AIE -i insts2_buffer.txt --trace_sz 32768 | FileCheck --check-prefix=CHECK_BUFFER %s +// CHECK_BUFFER: PASS! +// +// RUN: %python aiecc.py --xchesscc --xbridge --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie2_cascade.xclbin --npu-insts-name=insts2_cascade.txt %S/aie_cascadex4.mlir +// RUN: %run_on_npu ./test.exe -x aie2_cascade.xclbin -k MLIR_AIE -i insts2_cascade.txt --trace_sz 32768 | FileCheck --check-prefix=CHECK_CASCADE %s +// CHECK_CASCADE: PASS! diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/test.cpp b/test/npu-xrt/matrix_multiplication_using_cascade/test.cpp new file mode 100644 index 0000000000..3b26e0623a --- /dev/null +++ b/test/npu-xrt/matrix_multiplication_using_cascade/test.cpp @@ -0,0 +1,231 @@ + +//===- test.cpp -------------------------------------------000---*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2024, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xrt/xrt_bo.h" +#include "xrt/xrt_device.h" +#include "xrt/xrt_kernel.h" + +#include "matrix_multiplication.h" + +constexpr int M = 16; +constexpr int K = 16; +constexpr int N = 16; + +constexpr int A_VOLUME = M * K; +constexpr int B_VOLUME = N * K; +constexpr int C_VOLUME = M * N; + +using A_DATATYPE = std::int32_t; +using B_DATATYPE = std::int32_t; +using C_DATATYPE = std::int32_t; + +constexpr int A_SIZE = (A_VOLUME * sizeof(A_DATATYPE)); +constexpr int B_SIZE = (B_VOLUME * sizeof(B_DATATYPE)); +constexpr int C_SIZE = (C_VOLUME * sizeof(C_DATATYPE)); + +constexpr bool VERIFY = true; + +namespace po = boost::program_options; + +int main(int argc, const char *argv[]) { + + // Program arguments parsing + po::options_description desc("Allowed options"); + po::variables_map vm; + matmul_common::add_default_options(desc); + matmul_common::parse_options(argc, argv, desc, vm); + int verbosity = vm["verbosity"].as(); + int trace_size = vm["trace_sz"].as(); + + srand(time(NULL)); + + std::vector instr_v = + matmul_common::load_instr_sequence(vm["instr"].as()); + if (verbosity >= 1) + std::cout << "Sequence instr count: " << instr_v.size() << "\n"; + + // Start the XRT test code + // Get a device handle + unsigned int device_index = 0; + auto device = xrt::device(device_index); + + // Load the xclbin + if (verbosity >= 1) + std::cout << "Loading xclbin: " << vm["xclbin"].as() << "\n"; + auto xclbin = xrt::xclbin(vm["xclbin"].as()); + + if (verbosity >= 1) + std::cout << "Kernel opcode: " << vm["kernel"].as() << "\n"; + std::string Node = vm["kernel"].as(); + + // Get the kernel from the xclbin + auto xkernels = xclbin.get_kernels(); + auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(), + [Node, verbosity](xrt::xclbin::kernel &k) { + auto name = k.get_name(); + if (verbosity >= 1) { + std::cout << "Name: " << name << std::endl; + } + return name.rfind(Node, 0) == 0; + }); + auto kernelName = xkernel.get_name(); + + if 
(verbosity >= 1) + std::cout << "Registering xclbin: " << vm["xclbin"].as() + << "\n"; + + device.register_xclbin(xclbin); + + // get a hardware context + if (verbosity >= 1) + std::cout << "Getting hardware context.\n"; + xrt::hw_context context(device, xclbin.get_uuid()); + + // get a kernel handle + if (verbosity >= 1) + std::cout << "Getting handle to kernel:" << kernelName << "\n"; + auto kernel = xrt::kernel(context, kernelName); + + auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + auto bo_a = + xrt::bo(device, A_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); + auto bo_b = + xrt::bo(device, B_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + auto bo_c = xrt::bo(device, C_SIZE + trace_size, XRT_BO_FLAGS_HOST_ONLY, + kernel.group_id(4)); + + if (verbosity >= 1) + std::cout << "Writing data into buffer objects.\n"; + + A_DATATYPE *bufA = bo_a.map(); + std::vector AVec(A_VOLUME); + for (int i = 0; i < A_VOLUME; i++) { + AVec[i] = matmul_common::random_int32_t(); + } + memcpy(bufA, AVec.data(), (AVec.size() * sizeof(A_DATATYPE))); + B_DATATYPE *bufB = bo_b.map(); + std::vector BVec(B_VOLUME); + for (int i = 0; i < B_VOLUME; i++) { + BVec[i] = matmul_common::random_int32_t(); + } + memcpy(bufB, BVec.data(), (BVec.size() * sizeof(B_DATATYPE))); + C_DATATYPE *bufC = bo_c.map(); + std::vector CVec(C_VOLUME); + // memcpy(bufC, CVec.data(), (CVec.size() * sizeof(C_DATATYPE))); + memset(bufC, 0, C_SIZE + trace_size); + + void *bufInstr = bo_instr.map(); + memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); + + bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_a.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_b.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_c.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + unsigned num_iter = 1; + float npu_time_total = 0; + float npu_time_min = 9999999; + float npu_time_max = 0; + + int errors = 0; + float macs = 2.0 * float(M) * float(K) * float(N); + + for (unsigned iter = 0; iter < num_iter; iter++) { + + if (verbosity >= 1) { + std::cout << "Running Kernel.\n"; + } + auto start = std::chrono::high_resolution_clock::now(); + auto run = kernel(bo_instr, instr_v.size(), bo_a, bo_b, bo_c); + run.wait(); + auto stop = std::chrono::high_resolution_clock::now(); + bo_c.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + memcpy(CVec.data(), bufC, (CVec.size() * sizeof(C_DATATYPE))); + // std::vector CVecRef(C_VOLUME); + if (VERIFY) { + if (verbosity >= 1) { + std::cout << "Verifying against reference matmul ..." << std::endl; + } + auto vstart = std::chrono::system_clock::now(); + // matmul_common::matmul_naive(M, N, K, AVec, BVec, CVecRef); + errors = matmul_common::verify(M, N, K, AVec, BVec, CVec); + auto vstop = std::chrono::system_clock::now(); + float vtime = + std::chrono::duration_cast(vstop - vstart) + .count(); + if (verbosity >= 1) { + std::cout << "Verify time: " << vtime << "secs." << std::endl; + } + } else { + if (verbosity >= 1) + std::cout << "WARNING: matmul results not verified." << std::endl; + } + + float npu_time = + std::chrono::duration_cast(stop - start) + .count(); + + if (trace_size > 0) { + matmul_common::write_out_trace(((char *)bufC) + C_SIZE, trace_size, + vm["trace_file"].as()); + } + + npu_time_total += npu_time; + npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min; + npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max; + } + + std::cout << std::endl + << "Avg NPU matmul time: " << npu_time_total / num_iter << "us." 
+                    << std::endl;
+  std::cout << "Avg NPU gflops: " << macs / (1000 * npu_time_total / num_iter)
+            << std::endl;
+
+  std::cout << std::endl
+            << "Min NPU matmul time: " << npu_time_min << "us." << std::endl;
+  std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_min) << std::endl;
+
+  std::cout << std::endl
+            << "Max NPU matmul time: " << npu_time_max << "us." << std::endl;
+  std::cout << "Max NPU gflops: " << macs / (1000 * npu_time_max) << std::endl;
+
+  if (VERIFY && !errors) {
+    std::cout << "\nPASS!\n\n";
+
+    // Open the CSV file in appending mode
+    std::ofstream outfile;
+    outfile.open("../results.csv", std::ios_base::app);
+    // Write M, N, K, and avg runtime to the CSV file
+    outfile << M << "," << N << "," << K << "," << npu_time_total / num_iter
+            << "," << macs / (1000 * npu_time_total / num_iter) << std::endl;
+    // Close the CSV file
+    outfile.close();
+
+    return 0;
+  } else {
+    std::cout << "\nError count: " << errors << "\n\n";
+    std::cout << "\nFailed.\n\n";
+    return 1;
+  }
+}
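For reference, the cascade dataflow that mm.cc drives with put_mcd()/get_scd_v32int32() can be modeled entirely on the host: each core adds its slice of the K dimension on top of the partial sums received from its upstream neighbour and forwards the result downstream, so only the last core in the chain holds the final C values. The sketch below is illustrative only; run_core, NUM_CORES, and the test values are invented for this example, and it models the arithmetic of the chain, not the AIE intrinsics, tiling, or DMA of the actual designs.

#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

// Illustrative sizes; the test above uses M = K = N = 16 on a chain of cores.
constexpr int M = 16, K = 16, N = 16;
constexpr int NUM_CORES = 4;
constexpr int K_SLICE = K / NUM_CORES;

// One "core": accumulate partial products for its K-slice on top of the
// partial sums received over the modeled cascade (cascade_in) and forward
// the result (cascade_out). The vectors stand in for the hardware cascade
// channel that get_scd/put_mcd drive in mm.cc.
void run_core(int core, const std::vector<int32_t> &a,
              const std::vector<int32_t> &b,
              const std::vector<int32_t> &cascade_in,
              std::vector<int32_t> &cascade_out) {
  const int k_begin = core * K_SLICE;
  for (int m = 0; m < M; m++)
    for (int n = 0; n < N; n++) {
      int32_t sum = cascade_in[m * N + n]; // like get_scd at k_t == 0
      for (int k = k_begin; k < k_begin + K_SLICE; k++)
        sum += a[m * K + k] * b[k * N + n];
      cascade_out[m * N + n] = sum; // like put_mcd at the last k_t
    }
}

int main() {
  std::vector<int32_t> a(M * K), b(K * N);
  for (int i = 0; i < M * K; i++)
    a[i] = (i * 7) % 13;
  for (int i = 0; i < K * N; i++)
    b[i] = (i * 5) % 11;

  // Chain the cores: the first sees zeros, each later core sees its
  // predecessor's partial sums, and the last one holds the full result.
  std::vector<int32_t> partial(M * N, 0), next(M * N, 0);
  for (int core = 0; core < NUM_CORES; core++) {
    run_core(core, a, b, partial, next);
    std::swap(partial, next);
  }

  // Check against a naive reference, mirroring matmul_common::verify.
  int errors = 0;
  for (int m = 0; m < M; m++)
    for (int n = 0; n < N; n++) {
      int32_t ref = 0;
      for (int k = 0; k < K; k++)
        ref += a[m * K + k] * b[k * N + n];
      if (ref != partial[m * N + n])
        errors++;
    }
  std::cout << (errors == 0 ? "PASS!" : "FAIL") << "\n";
  return errors == 0 ? 0 : 1;
}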