From 67b26ffec1814c8ce9f552bc3faf0269559e12be Mon Sep 17 00:00:00 2001 From: Zhewen Yu <42230979+Yu-Zhewen@users.noreply.github.com> Date: Thu, 30 May 2024 02:15:36 +0100 Subject: [PATCH] Matmul cascade (#1465) --- .../AIE/Transforms/AIECreatePacketFlows.cpp | 34 +- programming_examples/utils/parse_trace.py | 64 +- .../trace_packet_routing.mlir | 28 + .../README.md | 38 + .../aie_bufferx4.mlir | 648 ++++++++++++++++++ .../aie_cascadex4.mlir | 496 ++++++++++++++ .../aie_plainx1.mlir | 201 ++++++ .../aie_plainx4.mlir | 532 ++++++++++++++ .../matrix_multiplication.h | 316 +++++++++ .../matrix_multiplication_using_cascade/mm.cc | 89 +++ .../run.lit | 19 + .../test.cpp | 231 +++++++ 12 files changed, 2681 insertions(+), 15 deletions(-) create mode 100644 test/create-packet-flows/trace_packet_routing.mlir create mode 100644 test/npu-xrt/matrix_multiplication_using_cascade/README.md create mode 100644 test/npu-xrt/matrix_multiplication_using_cascade/aie_bufferx4.mlir create mode 100644 test/npu-xrt/matrix_multiplication_using_cascade/aie_cascadex4.mlir create mode 100644 test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx1.mlir create mode 100644 test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx4.mlir create mode 100644 test/npu-xrt/matrix_multiplication_using_cascade/matrix_multiplication.h create mode 100644 test/npu-xrt/matrix_multiplication_using_cascade/mm.cc create mode 100644 test/npu-xrt/matrix_multiplication_using_cascade/run.lit create mode 100644 test/npu-xrt/matrix_multiplication_using_cascade/test.cpp diff --git a/lib/Dialect/AIE/Transforms/AIECreatePacketFlows.cpp b/lib/Dialect/AIE/Transforms/AIECreatePacketFlows.cpp index e42e498e3c..d86e87cc9a 100644 --- a/lib/Dialect/AIE/Transforms/AIECreatePacketFlows.cpp +++ b/lib/Dialect/AIE/Transforms/AIECreatePacketFlows.cpp @@ -157,10 +157,17 @@ void updateCoordinates(int &xCur, int &yCur, WireBundle move) { // Build a packet-switched route from the sourse to the destination with the // given ID. The route is recorded in the given map of switchboxes. 
void buildPSRoute( - int xSrc, int ySrc, Port sourcePort, int xDest, int yDest, Port destPort, - int flowID, + TileOp srcTile, Port sourcePort, TileOp destTile, Port destPort, int flowID, DenseMap, 8>> &switchboxes, bool reverseOrder = false) { + + int xSrc = srcTile.colIndex(); + int ySrc = srcTile.rowIndex(); + int xDest = destTile.colIndex(); + int yDest = destTile.rowIndex(); + + const auto &targetModel = getTargetModel(srcTile); + int xCur = xSrc; int yCur = ySrc; WireBundle curBundle = {}; @@ -213,6 +220,13 @@ void buildPSRoute( if (move == lastBundle) continue; + // If the source port is a trace port, we need to validate the destination + if (xCur == xSrc && yCur == ySrc && + sourcePort.bundle == WireBundle::Trace && + !targetModel.isValidTraceMaster(xSrc, ySrc, move, curChannel)) { + continue; + } + updateCoordinates(xCur, yCur, move); if (std::find(congestion.begin(), congestion.end(), TileID{xCur, yCur}) != @@ -320,22 +334,18 @@ struct AIERoutePacketFlowsPass Region &r = pktflow.getPorts(); Block &b = r.front(); int flowID = pktflow.IDInt(); - int xSrc = 0, ySrc = 0; - Port sourcePort; + Port sourcePort, destPort; + TileOp srcTile, destTile; for (Operation &Op : b.getOperations()) { if (auto pktSource = dyn_cast(Op)) { - auto srcTile = dyn_cast(pktSource.getTile().getDefiningOp()); - xSrc = srcTile.colIndex(); - ySrc = srcTile.rowIndex(); + srcTile = dyn_cast(pktSource.getTile().getDefiningOp()); sourcePort = pktSource.port(); } else if (auto pktDest = dyn_cast(Op)) { - auto destTile = dyn_cast(pktDest.getTile().getDefiningOp()); - int xDest = destTile.colIndex(); - int yDest = destTile.rowIndex(); - Port destPort = pktDest.port(); + destTile = dyn_cast(pktDest.getTile().getDefiningOp()); + destPort = pktDest.port(); - buildPSRoute(xSrc, ySrc, sourcePort, xDest, yDest, destPort, flowID, + buildPSRoute(srcTile, sourcePort, destTile, destPort, flowID, switchboxes, true); // Assign "keep_pkt_header flag" diff --git a/programming_examples/utils/parse_trace.py b/programming_examples/utils/parse_trace.py index 9d2cd144a6..23078ca9ad 100755 --- a/programming_examples/utils/parse_trace.py +++ b/programming_examples/utils/parse_trace.py @@ -702,8 +702,42 @@ def parse_mlir_trace_events(lines): pid_events[1][key][5] = (value >> 8) & 0xFF pid_events[1][key][6] = (value >> 16) & 0xFF pid_events[1][key][7] = (value >> 24) & 0xFF - - # TODO intfc and memtile event 0, 1 needs to also be defined + # memtile event 0 + elif address == 0x940E0: # 606432 + if pid_events[3].get(key) == None: + pid_events[3][key] = [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + ] # TODO no better way to init this? + # print("Trace event 0 configured to be ",hex(value)) + pid_events[3][key][0] = value & 0xFF + pid_events[3][key][1] = (value >> 8) & 0xFF + pid_events[3][key][2] = (value >> 16) & 0xFF + pid_events[3][key][3] = (value >> 24) & 0xFF + # memtile event 1 + elif address == 0x940E4: # 606436 + if pid_events[3].get(key) == None: + pid_events[3][key] = [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + ] # TODO no better way to init this? 
+ pid_events[3][key][4] = value & 0xFF + pid_events[3][key][5] = (value >> 8) & 0xFF + pid_events[3][key][6] = (value >> 16) & 0xFF + pid_events[3][key][7] = (value >> 24) & 0xFF + # TODO intfc event 0, 1 needs to also be defined # print("Found labels:\n") # for j in pid_events: @@ -750,7 +784,9 @@ def lookup_event_name_by_type(trace_type, code): # Mem traces elif trace_type == 1: # TODO Need to define these - if code == 21: # x15 + if code == 0x1: + event = "True" + elif code == 21: # x15 event = "DMA s2mm 0 start bd" elif code == 22: # x16 event = "DMA s2mm 1 start bd" @@ -780,6 +816,28 @@ def lookup_event_name_by_type(trace_type, code): event = "DMA s2mm 1 stalled lock acquire" else: event = "Unknown" + # memtile traces + elif trace_type == 3: + if code == 0x1: + event = "True" + elif code == 80: # 0x50 + event = "PortRunning0" + elif code == 84: # 0x54 + event = "PortRunning1" + elif code == 88: # 0x58 + event = "PortRunning2" + elif code == 92: # 0x5C + event = "PortRunning3" + elif code == 96: # 0x60 + event = "PortRunning4" + elif code == 100: # 0x64 + event = "PortRunning5" + elif code == 104: # 0x68 + event = "PortRunning6" + elif code == 108: # 0x6C + event = "PortRunning7" + else: + event = "Unknown" else: event = "Unknown" return event diff --git a/test/create-packet-flows/trace_packet_routing.mlir b/test/create-packet-flows/trace_packet_routing.mlir new file mode 100644 index 0000000000..4353244417 --- /dev/null +++ b/test/create-packet-flows/trace_packet_routing.mlir @@ -0,0 +1,28 @@ +//===- trace_packet_routing.mlir ------------------------------------------------*- MLIR -*-===// +// +// Copyright (C) 2024, Advanced Micro Devices, Inc. +// SPDX-License-Identifier: MIT +// +//===----------------------------------------------------------------------===// +// REQUIRES: ryzen_ai, chess + +// RUN: aie-opt --aie-create-packet-flows %s | FileCheck %s +// CHECK-LABEL: module @trace_packet_routing { + +module @trace_packet_routing { + aie.device(npu1_4col) { + %tile_0_0 = aie.tile(0, 0) + %tile_1_0 = aie.tile(1, 0) + %tile_0_2 = aie.tile(0, 2) + %tile_0_3 = aie.tile(0, 3) + + aie.packet_flow(0) { + aie.packet_source<%tile_0_2, Trace : 0> // core trace + aie.packet_dest<%tile_0_0, DMA : 1> + } {keep_pkt_header = true} + aie.packet_flow(1) { + aie.packet_source<%tile_0_3, Trace : 0> // core trace + aie.packet_dest<%tile_1_0, DMA : 1> + } {keep_pkt_header = true} + } +} diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/README.md b/test/npu-xrt/matrix_multiplication_using_cascade/README.md new file mode 100644 index 0000000000..130362794f --- /dev/null +++ b/test/npu-xrt/matrix_multiplication_using_cascade/README.md @@ -0,0 +1,38 @@ + + +## MM Cascade Design Example +This is a matrix multiply example with the sizes of (16 * 16) * (16 * 16) and i32 data type, where four different versions are compared to examine the possibility of distributing K dim accross multiple cores. + +### Plainx1 Version
+Generated from the IREE end-to-end flow, this version runs on a single core and serves as the baseline.
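+
+A minimal host-side sketch of the same (16 x 16) x (16 x 16) i32 computation is shown below, purely to pin down the data layout and the reference result. The real verification lives in `test.cpp` and `matrix_multiplication.h`; the names here (`ref_matmul`, `M`/`K`/`N`) are illustrative and not taken from that harness.
+
+```cpp
+// Standalone reference for the 16x16x16 i32 matmul computed by all four designs.
+// Build with: g++ -std=c++17 -o ref ref.cc
+#include <cstdint>
+#include <iostream>
+#include <vector>
+
+constexpr int M = 16, K = 16, N = 16;
+
+void ref_matmul(const std::vector<int32_t> &A, const std::vector<int32_t> &B,
+                std::vector<int32_t> &C) {
+  for (int m = 0; m < M; ++m)
+    for (int n = 0; n < N; ++n) {
+      int32_t acc = 0;
+      for (int k = 0; k < K; ++k)
+        acc += A[m * K + k] * B[k * N + n]; // row-major A, B, C
+      C[m * N + n] = acc;
+    }
+}
+
+int main() {
+  std::vector<int32_t> A(M * K), B(K * N), C(M * N);
+  for (int i = 0; i < M * K; ++i) A[i] = i % 7; // arbitrary test pattern
+  for (int i = 0; i < K * N; ++i) B[i] = i % 5;
+  ref_matmul(A, B, C);
+  std::cout << "C[0][0] = " << C[0] << ", C[15][15] = " << C[M * N - 1] << "\n";
+  return 0;
+}
+```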
+
+### Plainx4 Version
+Uses four cores in an output-stationary arrangement: each core computes its own portion of the output and keeps the accumulation local.
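+
+The sketch below models the output-stationary split on the host: each of the four "cores" owns a quarter of the output rows and performs the full K reduction locally. The row-wise split is only an illustrative assumption; the actual tiling is fixed by `aie_plainx4.mlir`.
+
+```cpp
+// Output-stationary partitioning: 4 workers, each owning 4 of the 16 output rows.
+#include <cstdint>
+#include <vector>
+
+constexpr int M = 16, K = 16, N = 16, CORES = 4, ROWS_PER_CORE = M / CORES;
+
+// One "core": the accumulator for its output tile never leaves this function.
+void core_compute(int core, const std::vector<int32_t> &A,
+                  const std::vector<int32_t> &B, std::vector<int32_t> &C) {
+  for (int m = core * ROWS_PER_CORE; m < (core + 1) * ROWS_PER_CORE; ++m)
+    for (int n = 0; n < N; ++n) {
+      int32_t acc = 0;
+      for (int k = 0; k < K; ++k) // the full K reduction stays local
+        acc += A[m * K + k] * B[k * N + n];
+      C[m * N + n] = acc;
+    }
+}
+
+int main() {
+  std::vector<int32_t> A(M * K, 1), B(K * N, 2), C(M * N, 0);
+  for (int core = 0; core < CORES; ++core) // sequential here; concurrent on the NPU
+    core_compute(core, A, B, C);
+  return 0;
+}
+```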
+
+### Bufferx4 Version
+Four cores are chained horizontally; the K dimension is split across them, and the intermediate accumulations are passed from core to core through shared buffers implemented as ObjectFIFOs.
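+
+A host-side model of this dataflow: K is split into four quarters, and each stage adds its contribution on top of the partial sums received from the previous stage before handing them on. The hand-off below is a plain vector standing in for the ObjectFIFOs `@of0`-`@of2`, which in the design carry one i32 element at a time; `buffer_stage` is an illustrative name.
+
+```cpp
+// Chain of four stages over K; stage i consumes the partial sums of stage i-1.
+#include <cstdint>
+#include <vector>
+
+constexpr int M = 16, K = 16, N = 16, CORES = 4, K_PER_CORE = K / CORES;
+
+// partial holds the running C; stage 0 receives zeros, stage 3 produces the result.
+std::vector<int32_t> buffer_stage(int core, const std::vector<int32_t> &A,
+                                  const std::vector<int32_t> &B,
+                                  std::vector<int32_t> partial) {
+  for (int m = 0; m < M; ++m)
+    for (int n = 0; n < N; ++n)
+      for (int k = core * K_PER_CORE; k < (core + 1) * K_PER_CORE; ++k)
+        partial[m * N + n] += A[m * K + k] * B[k * N + n];
+  return partial; // forwarded to the next core through the shared buffer
+}
+
+int main() {
+  std::vector<int32_t> A(M * K, 1), B(K * N, 2), C(M * N, 0);
+  for (int core = 0; core < CORES; ++core)
+    C = buffer_stage(core, A, B, std::move(C)); // the last stage holds the result
+  return 0;
+}
+```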
+
+### Cascadex4 Version
+Also uses four cores with the same K split, but the intermediate accumulations are forwarded over the cascade port (`aie.cascade_flow`) by the put/get matmul kernels linked from `mm.cc` (`matmul_scalar_put_4x1x4_4x4x4_i32_i32` and friends), as sketched below.
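+
+The put/get kernels implement the same chain as the buffer version, except that the hand-off travels over the cascade port between neighbouring cores instead of a memory buffer. The sketch below models that pattern on the host; `cascade_put`/`cascade_get` are hypothetical stand-ins for whatever cascade primitives `mm.cc` actually uses (its source is not reproduced here), backed by a plain FIFO so the example runs.
+
+```cpp
+// Cascade-style chain: partial sums travel element by element over the cascade.
+#include <cstdint>
+#include <queue>
+#include <vector>
+
+constexpr int M = 16, K = 16, N = 16, CORES = 4, K_PER_CORE = K / CORES;
+
+static std::queue<int32_t> cascade;           // stand-in for the cascade link
+static void cascade_put(int32_t v) { cascade.push(v); }
+static int32_t cascade_get() { int32_t v = cascade.front(); cascade.pop(); return v; }
+
+// The first core only puts, the middle cores put and get, the last core only gets.
+void cascade_stage(int core, const std::vector<int32_t> &A,
+                   const std::vector<int32_t> &B, std::vector<int32_t> &C) {
+  bool first = (core == 0), last = (core == CORES - 1);
+  for (int m = 0; m < M; ++m)
+    for (int n = 0; n < N; ++n) {
+      int32_t acc = first ? 0 : cascade_get(); // partial sum from the neighbour
+      for (int k = core * K_PER_CORE; k < (core + 1) * K_PER_CORE; ++k)
+        acc += A[m * K + k] * B[k * N + n];
+      if (last) C[m * N + n] = acc; else cascade_put(acc);
+    }
+}
+
+int main() {
+  std::vector<int32_t> A(M * K, 1), B(K * N, 2), C(M * N, 0);
+  for (int core = 0; core < CORES; ++core)    // on hardware these run concurrently
+    cascade_stage(core, A, B, C);
+  return 0;
+}
+```
+
+### Results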
+From the trace files, + +| | Total | Init | Compute | +|-----------|--------|-------|---------| +| Plainx1 | 25.6us | 7.6us | 18.0us | +| Plainx4 | 6.7us | 2.0us | 4.7us | +| Bufferx4 | 32.0us | 7.6us | 24.4us | +| Cascadex4 | 13.9us | 7.6us | 6.3us | + +The Buffer version is slow because of frequent lock-related operations. + +The Cascade version almost halves the latency but with 4x cores. The performance gain is constrained by the initialization time of the accumulation buffer (depends on MxN only). diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/aie_bufferx4.mlir b/test/npu-xrt/matrix_multiplication_using_cascade/aie_bufferx4.mlir new file mode 100644 index 0000000000..3112c0c05e --- /dev/null +++ b/test/npu-xrt/matrix_multiplication_using_cascade/aie_bufferx4.mlir @@ -0,0 +1,648 @@ +//===- aie.mlir ------------------------------------------------*- MLIR -*-===// +// +// Copyright (C) 2024, Advanced Micro Devices, Inc. +// SPDX-License-Identifier: MIT +// +//===----------------------------------------------------------------------===// + +module { + aie.device(npu1_4col) { + // + func.func private @flush_trace() + func.func private @event_0() + func.func private @event_1() + // + %tile_0_0 = aie.tile(0, 0) + %tile_0_1 = aie.tile(0, 1) + %tile_1_1 = aie.tile(1, 1) + %tile_0_2 = aie.tile(0, 2) + %tile_1_2 = aie.tile(1, 2) + %tile_2_2 = aie.tile(2, 2) + %tile_3_2 = aie.tile(3, 2) + // + %tile_1_0 = aie.tile(1, 0) + %tile_2_0 = aie.tile(2, 0) + %tile_3_0 = aie.tile(3, 0) + // + %lock_1_1 = aie.lock(%tile_1_1, 1) {init = 4 : i32} + %lock_1_1_0 = aie.lock(%tile_1_1, 0) {init = 0 : i32} + %lock_0_1 = aie.lock(%tile_0_1, 3) {init = 4 : i32} + %lock_0_1_1 = aie.lock(%tile_0_1, 2) {init = 0 : i32} + %lock_0_1_2 = aie.lock(%tile_0_1, 1) {init = 1 : i32} + %lock_0_1_3 = aie.lock(%tile_0_1, 0) {init = 0 : i32} + %lock_0_2 = aie.lock(%tile_0_2, 3) {init = 1 : i32} + %lock_0_2_4 = aie.lock(%tile_0_2, 2) {init = 0 : i32} + %lock_0_2_5 = aie.lock(%tile_0_2, 1) {init = 1 : i32} + %lock_0_2_6 = aie.lock(%tile_0_2, 0) {init = 0 : i32} + %lock_1_2 = aie.lock(%tile_1_2, 3) {init = 1 : i32} + %lock_1_2_7 = aie.lock(%tile_1_2, 2) {init = 0 : i32} + %lock_1_2_8 = aie.lock(%tile_1_2, 1) {init = 1 : i32} + %lock_1_2_9 = aie.lock(%tile_1_2, 0) {init = 0 : i32} + %lock_2_2 = aie.lock(%tile_2_2, 3) {init = 1 : i32} + %lock_2_2_10 = aie.lock(%tile_2_2, 2) {init = 0 : i32} + %lock_2_2_11 = aie.lock(%tile_2_2, 1) {init = 1 : i32} + %lock_2_2_12 = aie.lock(%tile_2_2, 0) {init = 0 : i32} + %lock_3_2 = aie.lock(%tile_3_2, 5) {init = 1 : i32} + %lock_3_2_13 = aie.lock(%tile_3_2, 4) {init = 0 : i32} + %lock_3_2_14 = aie.lock(%tile_3_2, 3) {init = 1 : i32} + %lock_3_2_15 = aie.lock(%tile_3_2, 2) {init = 0 : i32} + %lock_3_2_16 = aie.lock(%tile_3_2, 1) {init = 1 : i32} + %lock_3_2_17 = aie.lock(%tile_3_2, 0) {init = 0 : i32} + %buf14 = aie.buffer(%tile_0_1) {mem_bank = 0 : i32, sym_name = "buf14"} : memref<16x16xi32, 1 : i32> + %buf13 = aie.buffer(%tile_1_1) {mem_bank = 0 : i32, sym_name = "buf13"} : memref<16x16xi32, 1 : i32> + %buf12 = aie.buffer(%tile_0_1) {mem_bank = 0 : i32, sym_name = "buf12"} : memref<16x16xi32, 1 : i32> + %buf11 = aie.buffer(%tile_3_2) {mem_bank = 0 : i32, sym_name = "buf11"} : memref<1x4x4x4xi32, 2 : i32> + %buf10 = aie.buffer(%tile_3_2) {mem_bank = 0 : i32, sym_name = "buf10"} : memref<4x1x4x4xi32, 2 : i32> + %buf9 = aie.buffer(%tile_3_2) {mem_bank = 0 : i32, sym_name = "buf9"} : memref<4x4x4x4xi32, 2 : i32> + %buf8 = aie.buffer(%tile_2_2) {mem_bank = 0 : i32, sym_name = "buf8"} : 
memref<1x4x4x4xi32, 2 : i32> + %buf7 = aie.buffer(%tile_2_2) {mem_bank = 0 : i32, sym_name = "buf7"} : memref<4x1x4x4xi32, 2 : i32> + %buf6 = aie.buffer(%tile_2_2) {mem_bank = 0 : i32, sym_name = "buf6"} : memref<4x4x4x4xi32, 2 : i32> + %buf5 = aie.buffer(%tile_1_2) {mem_bank = 0 : i32, sym_name = "buf5"} : memref<1x4x4x4xi32, 2 : i32> + %buf4 = aie.buffer(%tile_1_2) {mem_bank = 0 : i32, sym_name = "buf4"} : memref<4x1x4x4xi32, 2 : i32> + %buf3 = aie.buffer(%tile_1_2) {mem_bank = 0 : i32, sym_name = "buf3"} : memref<4x4x4x4xi32, 2 : i32> + %buf2 = aie.buffer(%tile_0_2) {mem_bank = 0 : i32, sym_name = "buf2"} : memref<1x4x4x4xi32, 2 : i32> + %buf1 = aie.buffer(%tile_0_2) {mem_bank = 0 : i32, sym_name = "buf1"} : memref<4x1x4x4xi32, 2 : i32> + %buf0 = aie.buffer(%tile_0_2) {mem_bank = 0 : i32, sym_name = "buf0"} : memref<4x4x4x4xi32, 2 : i32> + aie.objectfifo @of0 (%tile_0_2, {%tile_1_2}, 1 : i32) : !aie.objectfifo> + aie.objectfifo @of1 (%tile_1_2, {%tile_2_2}, 1 : i32) : !aie.objectfifo> + aie.objectfifo @of2 (%tile_2_2, {%tile_3_2}, 1 : i32) : !aie.objectfifo> + %mem_0_2 = aie.mem(%tile_0_2) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_2_5, AcquireGreaterEqual, 1) + aie.dma_bd(%buf2 : memref<1x4x4x4xi32, 2 : i32>, 0, 64) + aie.use_lock(%lock_0_2_6, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(S2MM, 1, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_2, AcquireGreaterEqual, 1) + aie.dma_bd(%buf1 : memref<4x1x4x4xi32, 2 : i32>, 0, 64) + aie.use_lock(%lock_0_2_4, Release, 1) + aie.next_bd ^bb4 + } + %core_0_2 = aie.core(%tile_0_2) { + %c0_i32 = arith.constant 0 : i32 + %c4 = arith.constant 4 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_2_6, AcquireGreaterEqual, 1) + aie.use_lock(%lock_0_2_4, AcquireGreaterEqual, 1) + // + func.call @event_0() : () -> () + // + scf.for %arg0 = %c0 to %c4 step %c1 { + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c4 step %c1 { + memref.store %c0_i32, %buf0[%arg0, %arg1, %arg2, %arg3] : memref<4x4x4x4xi32, 2 : i32> + } + } + } + } + // + func.call @event_1() : () -> () + // + scf.for %arg0 = %c0 to %c4 step %c1 { + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c1 step %c1 { + %subview = memref.subview %buf2[%arg2, %arg0, 0, 0] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x4x4x4xi32, 2 : i32> to memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + %subview_8 = memref.subview %buf1[%arg1, %arg2, 0, 0] [1, 1, 4, 4] [1, 1, 1, 1] : memref<4x1x4x4xi32, 2 : i32> to memref<1x1x4x4xi32, strided<[16, 16, 4, 1], offset: ?>, 2 : i32> + %subview_9 = memref.subview %buf0[%arg1, %arg0, 0, 0] [1, 1, 4, 4] [1, 1, 1, 1] : memref<4x4x4x4xi32, 2 : i32> to memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + // + func.call @event_0() : () -> () + // + scf.for %arg3 = %c0 to %c4 step %c1 { + scf.for %arg4 = %c0 to %c4 step %c1 { + scf.for %arg5 = %c0 to %c4 step %c1 { + %0 = memref.load %subview[%c0, %c0, %arg3, %arg5] : memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + %1 = memref.load %subview_8[%c0, %c0, %arg5, %arg4] : memref<1x1x4x4xi32, strided<[16, 16, 4, 1], offset: ?>, 2 : i32> + %2 = memref.load %subview_9[%c0, %c0, %arg3, %arg4] : 
memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + %3 = arith.muli %0, %1 : i32 + %4 = arith.addi %2, %3 : i32 + memref.store %4, %subview_9[%c0, %c0, %arg3, %arg4] : memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + } + %cond0 = arith.cmpi "eq", %arg2, %c0 : index + scf.if %cond0 { + %5 = aie.objectfifo.acquire @of0 (Produce, 1) : !aie.objectfifosubview> + %6 = aie.objectfifo.subview.access %5[0] : !aie.objectfifosubview> -> memref<1xi32> + %7 = memref.load %subview_9[%c0, %c0, %arg3, %arg4] : memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + memref.store %7, %6[%c0] : memref<1xi32> + aie.objectfifo.release @of0 (Produce, 1) + } + } + } + // + func.call @event_1() : () -> () + // + } + } + } + aie.use_lock(%lock_0_2_5, Release, 1) + aie.use_lock(%lock_0_2, Release, 1) + // + func.call @flush_trace() : () -> () + // + cf.br ^bb1 + } {elf_file = "segment_0_core_0_2.elf",link_with = "mm.o"} + %mem_1_2 = aie.mem(%tile_1_2) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_1_2_8, AcquireGreaterEqual, 1) + aie.dma_bd(%buf5 : memref<1x4x4x4xi32, 2 : i32>, 0, 64) + aie.use_lock(%lock_1_2_9, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(S2MM, 1, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_1_2, AcquireGreaterEqual, 1) + aie.dma_bd(%buf4 : memref<4x1x4x4xi32, 2 : i32>, 0, 64) + aie.use_lock(%lock_1_2_7, Release, 1) + aie.next_bd ^bb4 + } + %core_1_2 = aie.core(%tile_1_2) { + %c0_i32 = arith.constant 0 : i32 + %c4 = arith.constant 4 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_1_2_9, AcquireGreaterEqual, 1) + aie.use_lock(%lock_1_2_7, AcquireGreaterEqual, 1) + // + func.call @event_0() : () -> () + // + scf.for %arg0 = %c0 to %c4 step %c1 { + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c4 step %c1 { + memref.store %c0_i32, %buf3[%arg0, %arg1, %arg2, %arg3] : memref<4x4x4x4xi32, 2 : i32> + } + } + } + } + // + func.call @event_1() : () -> () + // + scf.for %arg0 = %c0 to %c4 step %c1 { + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c1 step %c1 { + %subview = memref.subview %buf5[%arg2, %arg0, 0, 0] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x4x4x4xi32, 2 : i32> to memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + %subview_8 = memref.subview %buf4[%arg1, %arg2, 0, 0] [1, 1, 4, 4] [1, 1, 1, 1] : memref<4x1x4x4xi32, 2 : i32> to memref<1x1x4x4xi32, strided<[16, 16, 4, 1], offset: ?>, 2 : i32> + %subview_9 = memref.subview %buf3[%arg1, %arg0, 0, 0] [1, 1, 4, 4] [1, 1, 1, 1] : memref<4x4x4x4xi32, 2 : i32> to memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + // + func.call @event_0() : () -> () + // + scf.for %arg3 = %c0 to %c4 step %c1 { + scf.for %arg4 = %c0 to %c4 step %c1 { + %cond0 = arith.cmpi "eq", %arg2, %c0 : index + scf.if %cond0 { + %5 = aie.objectfifo.acquire @of0 (Consume, 1) : !aie.objectfifosubview> + %6 = aie.objectfifo.subview.access %5[0] : !aie.objectfifosubview> -> memref<1xi32> + %7 = memref.load %6[%c0] : memref<1xi32> + memref.store %7, %subview_9[%c0, %c0, %arg3, %arg4] : memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + aie.objectfifo.release @of0 (Consume, 1) + } + scf.for %arg5 = %c0 to %c4 step 
%c1 { + %0 = memref.load %subview[%c0, %c0, %arg3, %arg5] : memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + %1 = memref.load %subview_8[%c0, %c0, %arg5, %arg4] : memref<1x1x4x4xi32, strided<[16, 16, 4, 1], offset: ?>, 2 : i32> + %2 = memref.load %subview_9[%c0, %c0, %arg3, %arg4] : memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + %3 = arith.muli %0, %1 : i32 + %4 = arith.addi %2, %3 : i32 + memref.store %4, %subview_9[%c0, %c0, %arg3, %arg4] : memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + } + %cond1 = arith.cmpi "eq", %arg2, %c0 : index + scf.if %cond1 { + %8 = aie.objectfifo.acquire @of1 (Produce, 1) : !aie.objectfifosubview> + %9 = aie.objectfifo.subview.access %8[0] : !aie.objectfifosubview> -> memref<1xi32> + %10 = memref.load %subview_9[%c0, %c0, %arg3, %arg4] : memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + memref.store %10, %9[%c0] : memref<1xi32> + aie.objectfifo.release @of1 (Produce, 1) + } + } + } + // + func.call @event_1() : () -> () + // + } + } + } + aie.use_lock(%lock_1_2_8, Release, 1) + aie.use_lock(%lock_1_2, Release, 1) + // + func.call @flush_trace() : () -> () + // + cf.br ^bb1 + } {elf_file = "segment_0_core_1_2.elf",link_with = "mm.o"} + %mem_2_2 = aie.mem(%tile_2_2) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_2_2_11, AcquireGreaterEqual, 1) + aie.dma_bd(%buf8 : memref<1x4x4x4xi32, 2 : i32>, 0, 64) + aie.use_lock(%lock_2_2_12, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(S2MM, 1, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_2_2, AcquireGreaterEqual, 1) + aie.dma_bd(%buf7 : memref<4x1x4x4xi32, 2 : i32>, 0, 64) + aie.use_lock(%lock_2_2_10, Release, 1) + aie.next_bd ^bb4 + } + %core_2_2 = aie.core(%tile_2_2) { + %c0_i32 = arith.constant 0 : i32 + %c4 = arith.constant 4 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_2_2_12, AcquireGreaterEqual, 1) + aie.use_lock(%lock_2_2_10, AcquireGreaterEqual, 1) + // + func.call @event_0() : () -> () + // + scf.for %arg0 = %c0 to %c4 step %c1 { + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c4 step %c1 { + memref.store %c0_i32, %buf6[%arg0, %arg1, %arg2, %arg3] : memref<4x4x4x4xi32, 2 : i32> + } + } + } + } + // + func.call @event_1() : () -> () + // + scf.for %arg0 = %c0 to %c4 step %c1 { + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c1 step %c1 { + %subview = memref.subview %buf8[%arg2, %arg0, 0, 0] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x4x4x4xi32, 2 : i32> to memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + %subview_8 = memref.subview %buf7[%arg1, %arg2, 0, 0] [1, 1, 4, 4] [1, 1, 1, 1] : memref<4x1x4x4xi32, 2 : i32> to memref<1x1x4x4xi32, strided<[16, 16, 4, 1], offset: ?>, 2 : i32> + %subview_9 = memref.subview %buf6[%arg1, %arg0, 0, 0] [1, 1, 4, 4] [1, 1, 1, 1] : memref<4x4x4x4xi32, 2 : i32> to memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + // + func.call @event_0() : () -> () + // + scf.for %arg3 = %c0 to %c4 step %c1 { + scf.for %arg4 = %c0 to %c4 step %c1 { + %cond0 = arith.cmpi "eq", %arg2, %c0 : index + scf.if %cond0 { + %5 = aie.objectfifo.acquire @of1 (Consume, 1) : !aie.objectfifosubview> + %6 = 
aie.objectfifo.subview.access %5[0] : !aie.objectfifosubview> -> memref<1xi32> + %7 = memref.load %6[%c0] : memref<1xi32> + memref.store %7, %subview_9[%c0, %c0, %arg3, %arg4] : memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + aie.objectfifo.release @of1 (Consume, 1) + } + scf.for %arg5 = %c0 to %c4 step %c1 { + %0 = memref.load %subview[%c0, %c0, %arg3, %arg5] : memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + %1 = memref.load %subview_8[%c0, %c0, %arg5, %arg4] : memref<1x1x4x4xi32, strided<[16, 16, 4, 1], offset: ?>, 2 : i32> + %2 = memref.load %subview_9[%c0, %c0, %arg3, %arg4] : memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + %3 = arith.muli %0, %1 : i32 + %4 = arith.addi %2, %3 : i32 + memref.store %4, %subview_9[%c0, %c0, %arg3, %arg4] : memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + } + %cond1 = arith.cmpi "eq", %arg2, %c0 : index + scf.if %cond1 { + %8 = aie.objectfifo.acquire @of2 (Produce, 1) : !aie.objectfifosubview> + %9 = aie.objectfifo.subview.access %8[0] : !aie.objectfifosubview> -> memref<1xi32> + %10 = memref.load %subview_9[%c0, %c0, %arg3, %arg4] : memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + memref.store %10, %9[%c0] : memref<1xi32> + aie.objectfifo.release @of2 (Produce, 1) + } + } + } + // + func.call @event_1() : () -> () + // + } + } + } + aie.use_lock(%lock_2_2_11, Release, 1) + aie.use_lock(%lock_2_2, Release, 1) + // + func.call @flush_trace() : () -> () + // + cf.br ^bb1 + } {elf_file = "segment_0_core_2_2.elf",link_with = "mm.o"} + %mem_3_2 = aie.mem(%tile_3_2) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb5, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_3_2_14, AcquireGreaterEqual, 1) + aie.dma_bd(%buf11 : memref<1x4x4x4xi32, 2 : i32>, 0, 64) + aie.use_lock(%lock_3_2_15, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(S2MM, 1, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_3_2, AcquireGreaterEqual, 1) + aie.dma_bd(%buf10 : memref<4x1x4x4xi32, 2 : i32>, 0, 64) + aie.use_lock(%lock_3_2_13, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb0 + %2 = aie.dma_start(MM2S, 0, ^bb6, ^bb3, repeat_count = 1) + ^bb6: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_3_2_17, AcquireGreaterEqual, 1) + aie.dma_bd(%buf9 : memref<4x4x4x4xi32, 2 : i32>, 0, 256, [, , ]) + aie.use_lock(%lock_3_2_16, Release, 1) + aie.next_bd ^bb6 + } + %core_3_2 = aie.core(%tile_3_2) { + %c0_i32 = arith.constant 0 : i32 + %c4 = arith.constant 4 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_3_2_16, AcquireGreaterEqual, 1) + aie.use_lock(%lock_3_2_15, AcquireGreaterEqual, 1) + aie.use_lock(%lock_3_2_13, AcquireGreaterEqual, 1) + // + func.call @event_0() : () -> () + // + scf.for %arg0 = %c0 to %c4 step %c1 { + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c4 step %c1 { + memref.store %c0_i32, %buf9[%arg0, %arg1, %arg2, %arg3] : memref<4x4x4x4xi32, 2 : i32> + } + } + } + } + // + func.call @event_1() : () -> () + // + scf.for %arg0 = %c0 to %c4 step %c1 { + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c1 step %c1 { + %subview = memref.subview %buf11[%arg2, %arg0, 0, 0] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x4x4x4xi32, 2 : i32> to memref<1x1x4x4xi32, strided<[64, 16, 
4, 1], offset: ?>, 2 : i32> + %subview_8 = memref.subview %buf10[%arg1, %arg2, 0, 0] [1, 1, 4, 4] [1, 1, 1, 1] : memref<4x1x4x4xi32, 2 : i32> to memref<1x1x4x4xi32, strided<[16, 16, 4, 1], offset: ?>, 2 : i32> + %subview_9 = memref.subview %buf9[%arg1, %arg0, 0, 0] [1, 1, 4, 4] [1, 1, 1, 1] : memref<4x4x4x4xi32, 2 : i32> to memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + // + func.call @event_0() : () -> () + // + scf.for %arg3 = %c0 to %c4 step %c1 { + scf.for %arg4 = %c0 to %c4 step %c1 { + %cond0 = arith.cmpi "eq", %arg2, %c0 : index + scf.if %cond0 { + %5 = aie.objectfifo.acquire @of2 (Consume, 1) : !aie.objectfifosubview> + %6 = aie.objectfifo.subview.access %5[0] : !aie.objectfifosubview> -> memref<1xi32> + %7 = memref.load %6[%c0] : memref<1xi32> + memref.store %7, %subview_9[%c0, %c0, %arg3, %arg4] : memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + aie.objectfifo.release @of2 (Consume, 1) + } + scf.for %arg5 = %c0 to %c4 step %c1 { + %0 = memref.load %subview[%c0, %c0, %arg3, %arg5] : memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + %1 = memref.load %subview_8[%c0, %c0, %arg5, %arg4] : memref<1x1x4x4xi32, strided<[16, 16, 4, 1], offset: ?>, 2 : i32> + %2 = memref.load %subview_9[%c0, %c0, %arg3, %arg4] : memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + %3 = arith.muli %0, %1 : i32 + %4 = arith.addi %2, %3 : i32 + memref.store %4, %subview_9[%c0, %c0, %arg3, %arg4] : memref<1x1x4x4xi32, strided<[64, 16, 4, 1], offset: ?>, 2 : i32> + } + } + } + // + func.call @event_1() : () -> () + // + } + } + } + aie.use_lock(%lock_3_2_17, Release, 1) + aie.use_lock(%lock_3_2_14, Release, 1) + aie.use_lock(%lock_3_2, Release, 1) + // + func.call @flush_trace() : () -> () + // + cf.br ^bb1 + } {elf_file = "segment_0_core_3_2.elf", link_with = "mm.o"} + aie.flow(%tile_0_0, DMA : 0, %tile_0_1, DMA : 0) + aie.flow(%tile_0_0, DMA : 1, %tile_1_1, DMA : 0) + aie.flow(%tile_0_1, DMA : 0, %tile_0_2, DMA : 0) + aie.flow(%tile_0_1, DMA : 1, %tile_1_2, DMA : 0) + aie.flow(%tile_0_1, DMA : 2, %tile_2_2, DMA : 0) + aie.flow(%tile_0_1, DMA : 3, %tile_3_2, DMA : 0) + aie.flow(%tile_1_1, DMA : 0, %tile_0_2, DMA : 1) + aie.flow(%tile_1_1, DMA : 1, %tile_1_2, DMA : 1) + aie.flow(%tile_1_1, DMA : 2, %tile_2_2, DMA : 1) + aie.flow(%tile_1_1, DMA : 3, %tile_3_2, DMA : 1) + aie.flow(%tile_3_2, DMA : 0, %tile_0_1, DMA : 1) + aie.flow(%tile_0_1, DMA : 4, %tile_0_0, DMA : 0) + aie.cascade_flow(%tile_0_2, %tile_1_2) + aie.cascade_flow(%tile_1_2, %tile_2_2) + aie.cascade_flow(%tile_2_2, %tile_3_2) + // + aie.packet_flow(0) { + aie.packet_source<%tile_0_2, Trace : 0> + aie.packet_dest<%tile_0_0, DMA : 1> + } {keep_pkt_header = true} + aie.packet_flow(1) { + aie.packet_source<%tile_1_2, Trace : 0> + aie.packet_dest<%tile_1_0, DMA : 1> + } {keep_pkt_header = true} + aie.packet_flow(2) { + aie.packet_source<%tile_2_2, Trace : 0> + aie.packet_dest<%tile_2_0, DMA : 1> + } {keep_pkt_header = true} + aie.packet_flow(3) { + aie.packet_source<%tile_3_2, Trace : 0> + aie.packet_dest<%tile_3_0, DMA : 1> + } {keep_pkt_header = true} + aie.packet_flow(4) { + aie.packet_source<%tile_0_1, Trace : 0> + aie.packet_dest<%tile_0_0, DMA : 1> + } {keep_pkt_header = true} + aie.packet_flow(5) { + aie.packet_source<%tile_1_1, Trace : 0> + aie.packet_dest<%tile_1_0, DMA : 1> + } {keep_pkt_header = true} + // + %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb13, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_1, 
AcquireGreaterEqual, 4) + aie.dma_bd(%buf14 : memref<16x16xi32, 1 : i32>, 0, 256) + aie.use_lock(%lock_0_1_1, Release, 4) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb5 + %1 = aie.dma_start(S2MM, 1, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_1_2, AcquireGreaterEqual, 1) + aie.dma_bd(%buf12 : memref<16x16xi32, 1 : i32>, 0, 256) + aie.use_lock(%lock_0_1_3, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb5 + %2 = aie.dma_start(MM2S, 0, ^bb6, ^bb3, repeat_count = 1) + ^bb6: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf14 : memref<16x16xi32, 1 : i32>, 0, 64, [, , ]) + aie.use_lock(%lock_0_1, Release, 1) + aie.next_bd ^bb6 + ^bb7: // pred: ^bb7 + %3 = aie.dma_start(MM2S, 1, ^bb8, ^bb5, repeat_count = 1) + ^bb8: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf14 : memref<16x16xi32, 1 : i32>, 4, 64, [, , ]) + aie.use_lock(%lock_0_1, Release, 1) + aie.next_bd ^bb8 + ^bb9: // pred: ^bb9 + %4 = aie.dma_start(MM2S, 2, ^bb10, ^bb7, repeat_count = 1) + ^bb10: // 2 preds: ^bb7, ^bb8 + aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf14 : memref<16x16xi32, 1 : i32>, 8, 64, [, , ]) + aie.use_lock(%lock_0_1, Release, 1) + aie.next_bd ^bb10 + ^bb11: // pred: ^bb0 + %5 = aie.dma_start(MM2S, 3, ^bb12, ^bb9, repeat_count = 1) + ^bb12: // 2 preds: ^bb9, ^bb10 + aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf14 : memref<16x16xi32, 1 : i32>, 12, 64, [, , ]) + aie.use_lock(%lock_0_1, Release, 1) + aie.next_bd ^bb12 + ^bb13: // pred: ^bb0 + %6 = aie.dma_start(MM2S, 4, ^bb14, ^bb11, repeat_count = 1) + ^bb14: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_1_3, AcquireGreaterEqual, 1) + aie.dma_bd(%buf12 : memref<16x16xi32, 1 : i32>, 0, 256) + aie.use_lock(%lock_0_1_2, Release, 1) + aie.next_bd ^bb14 + } + %memtile_dma_1_1 = aie.memtile_dma(%tile_1_1) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb9, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_1_1, AcquireGreaterEqual, 4) + aie.dma_bd(%buf13 : memref<16x16xi32, 1 : i32>, 0, 256) + aie.use_lock(%lock_1_1_0, Release, 4) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb5 + %1 = aie.dma_start(MM2S, 0, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1) + aie.dma_bd(%buf13 : memref<16x16xi32, 1 : i32>, 0, 64, [, , ]) + aie.use_lock(%lock_1_1, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb7 + %2 = aie.dma_start(MM2S, 1, ^bb6, ^bb3, repeat_count = 1) + ^bb6: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1) + aie.dma_bd(%buf13 : memref<16x16xi32, 1 : i32>, 64, 64, [, , ]) + aie.use_lock(%lock_1_1, Release, 1) + aie.next_bd ^bb6 + ^bb7: // pred: ^bb9 + %3 = aie.dma_start(MM2S, 2, ^bb8, ^bb5, repeat_count = 1) + ^bb8: // 2 preds: ^bb7, ^bb8 + aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1) + aie.dma_bd(%buf13 : memref<16x16xi32, 1 : i32>, 128, 64, [, , ]) + aie.use_lock(%lock_1_1, Release, 1) + aie.next_bd ^bb8 + ^bb9: // pred: ^bb0 + %4 = aie.dma_start(MM2S, 3, ^bb10, ^bb7, repeat_count = 1) + ^bb10: // 2 preds: ^bb9, ^bb10 + aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1) + aie.dma_bd(%buf13 : memref<16x16xi32, 1 : i32>, 192, 64, [, , ]) + aie.use_lock(%lock_1_1, Release, 1) + aie.next_bd ^bb10 + } + aie.shim_dma_allocation @airMemcpyId12(S2MM, 0, 0) + memref.global "public" @airMemcpyId12 : memref<16x16xi32, 1 : i32> + aie.shim_dma_allocation 
@airMemcpyId4(MM2S, 0, 0) + memref.global "public" @airMemcpyId4 : memref<16x16xi32, 1 : i32> + aie.shim_dma_allocation @airMemcpyId5(MM2S, 1, 0) + memref.global "public" @airMemcpyId5 : memref<16x16xi32, 1 : i32> + func.func @matmul_16x16_16xi32__dispatch_0_matmul_16x16x16_i32(%arg0: memref<16x16xi32>, %arg1: memref<16x16xi32>, %arg2: memref<16x16xi32>) { + // + aiex.npu.write32 {address = 212992 : ui32, column = 3 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213200 : ui32, column = 3 : i32, row = 2 : i32, value = 7995392 : ui32} // [22:16] start event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213204 : ui32, column = 3 : i32, row = 2 : i32, value = 3 : ui32} // packet_type: 0(core), packet_id: 3 + aiex.npu.write32 {address = 213216 : ui32, column = 3 : i32, row = 2 : i32, value = 1260527873 : ui32} // events: 0x4B(port0 run) 22(event1) 21(event0) 01(true) + aiex.npu.write32 {address = 213220 : ui32, column = 3 : i32, row = 2 : i32, value = 6735 : ui32} // events: 0x00 00 1A(lock stall) 4F(port1 run) + aiex.npu.write32 {address = 261888 : ui32, column = 3 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 + aiex.npu.write32 {address = 261892 : ui32, column = 3 : i32, row = 2 : i32, value = 0 : ui32} + aiex.npu.writebd_shimtile {bd_id = 15 : i32, buffer_length = 8192 : i32, buffer_offset = 25600 : i32, column = 3 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 3: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.write32 {address = 119308 : ui32, column = 3 : i32, row = 0 : i32, value = 15 : ui32} + + aiex.npu.write32 {address = 212992 : ui32, column = 2 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213200 : ui32, column = 2 : i32, row = 2 : i32, value = 7995392 : ui32} // [22:16] start event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213204 : ui32, column = 2 : i32, row = 2 : i32, value = 2 : ui32} // packet_type: 0(core), packet_id: 2 + aiex.npu.write32 {address = 213216 : ui32, column = 2 : i32, row = 2 : i32, value = 1260527873 : ui32} // events: 0x4B(port0 run) 22(event1) 21(event0) 01(true) + aiex.npu.write32 {address = 213220 : ui32, column = 2 : i32, row = 2 : i32, value = 6735 : ui32} // events: 0x00 00 1A(lock stall) 4F(port1 run) + aiex.npu.write32 {address = 261888 : ui32, column = 2 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 + aiex.npu.write32 {address = 261892 : ui32, column = 2 : i32, row = 2 : i32, value = 0 : ui32} + aiex.npu.writebd_shimtile {bd_id = 14 : i32, buffer_length = 8192 : i32, buffer_offset = 17408 : i32, column = 2 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id 
= 0 : i32, packet_id = 2: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.write32 {address = 119308 : ui32, column = 2 : i32, row = 0 : i32, value = 14 : ui32} + + aiex.npu.write32 {address = 212992 : ui32, column = 1 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213200 : ui32, column = 1 : i32, row = 2 : i32, value = 7995392 : ui32} // [22:16] start event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213204 : ui32, column = 1 : i32, row = 2 : i32, value = 1 : ui32} // packet_type: 0(core), packet_id: 1 + aiex.npu.write32 {address = 213216 : ui32, column = 1 : i32, row = 2 : i32, value = 1260527873 : ui32} // events: 0x4B(port0 run) 22(event1) 21(event0) 01(true) + aiex.npu.write32 {address = 213220 : ui32, column = 1 : i32, row = 2 : i32, value = 6735 : ui32} // events: 0x00 00 1A(lock stall) 4F(port1 run) + aiex.npu.write32 {address = 261888 : ui32, column = 1 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 + aiex.npu.write32 {address = 261892 : ui32, column = 1 : i32, row = 2 : i32, value = 0 : ui32} + aiex.npu.writebd_shimtile {bd_id = 13 : i32, buffer_length = 8192 : i32, buffer_offset = 9216 : i32, column = 1 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 1: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.write32 {address = 119308 : ui32, column = 1 : i32, row = 0 : i32, value = 13 : ui32} + + aiex.npu.write32 {address = 212992 : ui32, column = 0 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213200 : ui32, column = 0 : i32, row = 2 : i32, value = 7995392 : ui32} // [22:16] start event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213204 : ui32, column = 0 : i32, row = 2 : i32, value = 0 : ui32} // packet_type: 0(core), packet_id: 0 + aiex.npu.write32 {address = 213216 : ui32, column = 0 : i32, row = 2 : i32, value = 1260527873 : ui32} // events: 0x4B(port0 run) 22(event1) 21(event0) 01(true) + aiex.npu.write32 {address = 213220 : ui32, column = 0 : i32, row = 2 : i32, value = 6735 : ui32} // events:0x00 00 1A(lock stall) 4F(port1 run) + aiex.npu.write32 {address = 261888 : ui32, column = 0 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 + aiex.npu.write32 {address = 261892 : ui32, column = 0 : i32, row = 2 : i32, value = 0 : ui32} + aiex.npu.writebd_shimtile {bd_id = 12 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : 
i32, value = 12 : ui32} + + aiex.npu.write32 {address = 606208 : ui32, column = 1 : i32, row = 1 : i32, value = 40192 : ui32} // [15:8] reset event: 157(BROADCAST_15) + aiex.npu.write32 {address = 606416 : ui32, column = 1 : i32, row = 1 : i32, value = 10289152 : ui32} // [23:16] start event: 157(BROADCAST_15) + aiex.npu.write32 {address = 606420 : ui32, column = 1 : i32, row = 1 : i32, value = 12293 : ui32} // [14:12] packet_type: 3(mem_tile), [4:0] packet_id: 5 + aiex.npu.write32 {address = 606432 : ui32, column = 1 : i32, row = 1 : i32, value = 336 : ui32} // events: 0x00 00 01(true) 50(port0 run) + aiex.npu.write32 {address = 606436 : ui32, column = 1 : i32, row = 1 : i32, value = 1415076960 : ui32} // events: 0x54(port1 run) 58(port2 run) 5C(port3 run) 60(port4 run) + aiex.npu.write32 {address = 724736 : ui32, column = 1 : i32, row = 1 : i32, value = 33620000 : ui32} // [29:24] port3 MM2S-2, [21:16] port2 MM2S-1, [13:8] port1 MM2S-0, [5:0] port0 S2MM-0 + aiex.npu.write32 {address = 724740: ui32, column = 1 : i32, row = 1 : i32, value = 3 : ui32} // [5:0] port4 MM2S-3 + aiex.npu.writebd_shimtile {bd_id = 11 : i32, buffer_length = 8192 : i32, buffer_offset = 9216 : i32, column = 1 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 5: i32, packet_type = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.write32 {address = 119308 : ui32, column = 1 : i32, row = 0 : i32, value = 11 : ui32} + + aiex.npu.write32 {address = 606208 : ui32, column = 0 : i32, row = 1 : i32, value = 40192 : ui32} // [15:8] reset event: 157(BROADCAST_15) + aiex.npu.write32 {address = 606416 : ui32, column = 0 : i32, row = 1 : i32, value = 10289152 : ui32} // [23:16] start event: 157(BROADCAST_15) + aiex.npu.write32 {address = 606420 : ui32, column = 0 : i32, row = 1 : i32, value = 12292 : ui32} // [14:12] packet_type: 3(mem_tile), [4:0] packet_id: 4 + aiex.npu.write32 {address = 606432 : ui32, column = 0 : i32, row = 1 : i32, value = 760239192 : ui32} // events: 0x2D(lock release) 50(port0 run) 0x54(port1 run) 58(port2 run) + aiex.npu.write32 {address = 606436 : ui32, column = 0 : i32, row = 1 : i32, value = 1549821032 : ui32} // events: 5C(port3 run) 60(port4 run) 64(port5 run) 68(port6 run) + aiex.npu.write32 {address = 724736 : ui32, column = 0 : i32, row = 1 : i32, value = 33620000 : ui32} // [29:24] port3 MM2S-2, [21:16] port2 MM2S-1, [13:8] port1 MM2S-0, [5:0] port0 S2MM-0 + aiex.npu.write32 {address = 724740: ui32, column = 0 : i32, row = 1 : i32, value = 270595 : ui32} // [21:16] port6 MM2S-4, [13:8] port5 S2MM-1, [5:0] port4 MM2S-3 + aiex.npu.writebd_shimtile {bd_id = 10 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 4: i32, packet_type = 3 : i32, use_next_bd = 0 
: i32, valid_bd = 1 : i32} + aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 10 : ui32} + + aiex.npu.write32 {address = 212992: ui32, column = 0 : i32, row = 0 : i32, value = 32512 : ui32} // [14:8] reset event: 127(USER_EVENT_1) + aiex.npu.write32 {address = 213068: ui32, column = 0 : i32, row = 0 : i32, value = 127 : ui32} // [6:0] broadcast 15: 127(USER_EVENT_1) + aiex.npu.write32 {address = 213000: ui32, column = 0 : i32, row = 0 : i32, value = 127 : ui32} // event generate [6:0]: 127(USER_EVENT_1) + + // + memref.assume_alignment %arg0, 64 : memref<16x16xi32> + memref.assume_alignment %arg1, 64 : memref<16x16xi32> + memref.assume_alignment %arg2, 64 : memref<16x16xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<16x16xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<16x16xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 2 : i64, metadata = @airMemcpyId12} : memref<16x16xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} + return + } + } {sym_name = "segment_0"} +} diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/aie_cascadex4.mlir b/test/npu-xrt/matrix_multiplication_using_cascade/aie_cascadex4.mlir new file mode 100644 index 0000000000..fb58fa0fb0 --- /dev/null +++ b/test/npu-xrt/matrix_multiplication_using_cascade/aie_cascadex4.mlir @@ -0,0 +1,496 @@ +//===- aie.mlir ------------------------------------------------*- MLIR -*-===// +// +// Copyright (C) 2024, Advanced Micro Devices, Inc. +// SPDX-License-Identifier: MIT +// +//===----------------------------------------------------------------------===// + +module { + aie.device(npu1_4col) { + func.func private @matmul_scalar_put_4x1x4_4x4x4_i32_i32(memref<1x4x4x4xi32, 2 : i32>, memref<4x1x4x4xi32, 2 : i32>, memref<4x4x4x4xi32, 2 : i32>) + func.func private @matmul_scalar_put_get_4x1x4_4x4x4_i32_i32(memref<1x4x4x4xi32, 2 : i32>, memref<4x1x4x4xi32, 2 : i32>, memref<4x4x4x4xi32, 2 : i32>) + func.func private @matmul_scalar_get_4x1x4_4x4x4_i32_i32(memref<1x4x4x4xi32, 2 : i32>, memref<4x1x4x4xi32, 2 : i32>, memref<4x4x4x4xi32, 2 : i32>) + // + func.func private @event_0() + func.func private @event_1() + func.func private @flush_trace() + // + %tile_0_0 = aie.tile(0, 0) + %tile_0_1 = aie.tile(0, 1) + %tile_1_1 = aie.tile(1, 1) + %tile_0_2 = aie.tile(0, 2) + %tile_1_2 = aie.tile(1, 2) + %tile_2_2 = aie.tile(2, 2) + %tile_3_2 = aie.tile(3, 2) + // + %tile_1_0 = aie.tile(1, 0) + %tile_2_0 = aie.tile(2, 0) + %tile_3_0 = aie.tile(3, 0) + // + %lock_1_1 = aie.lock(%tile_1_1, 1) {init = 4 : i32} + %lock_1_1_0 = aie.lock(%tile_1_1, 0) {init = 0 : i32} + %lock_0_1 = aie.lock(%tile_0_1, 3) {init = 4 : i32} + %lock_0_1_1 = aie.lock(%tile_0_1, 2) {init = 0 : i32} + %lock_0_1_2 = aie.lock(%tile_0_1, 1) {init = 1 : i32} + %lock_0_1_3 = aie.lock(%tile_0_1, 0) {init = 0 : i32} + %lock_0_2 = aie.lock(%tile_0_2, 3) {init = 1 : i32} + %lock_0_2_4 = aie.lock(%tile_0_2, 2) {init = 0 : i32} + %lock_0_2_5 = aie.lock(%tile_0_2, 1) {init = 1 : i32} + %lock_0_2_6 = aie.lock(%tile_0_2, 0) {init = 0 : i32} + %lock_1_2 = aie.lock(%tile_1_2, 3) {init = 1 : i32} + %lock_1_2_7 = aie.lock(%tile_1_2, 2) {init = 0 : i32} + %lock_1_2_8 = aie.lock(%tile_1_2, 1) {init = 1 : i32} + %lock_1_2_9 = aie.lock(%tile_1_2, 0) {init = 0 : i32} + %lock_2_2 = 
aie.lock(%tile_2_2, 3) {init = 1 : i32} + %lock_2_2_10 = aie.lock(%tile_2_2, 2) {init = 0 : i32} + %lock_2_2_11 = aie.lock(%tile_2_2, 1) {init = 1 : i32} + %lock_2_2_12 = aie.lock(%tile_2_2, 0) {init = 0 : i32} + %lock_3_2 = aie.lock(%tile_3_2, 5) {init = 1 : i32} + %lock_3_2_13 = aie.lock(%tile_3_2, 4) {init = 0 : i32} + %lock_3_2_14 = aie.lock(%tile_3_2, 3) {init = 1 : i32} + %lock_3_2_15 = aie.lock(%tile_3_2, 2) {init = 0 : i32} + %lock_3_2_16 = aie.lock(%tile_3_2, 1) {init = 1 : i32} + %lock_3_2_17 = aie.lock(%tile_3_2, 0) {init = 0 : i32} + %buf14 = aie.buffer(%tile_0_1) {mem_bank = 0 : i32, sym_name = "buf14"} : memref<16x16xi32, 1 : i32> + %buf13 = aie.buffer(%tile_1_1) {mem_bank = 0 : i32, sym_name = "buf13"} : memref<16x16xi32, 1 : i32> + %buf12 = aie.buffer(%tile_0_1) {mem_bank = 0 : i32, sym_name = "buf12"} : memref<16x16xi32, 1 : i32> + %buf11 = aie.buffer(%tile_3_2) {mem_bank = 0 : i32, sym_name = "buf11"} : memref<1x4x4x4xi32, 2 : i32> + %buf10 = aie.buffer(%tile_3_2) {mem_bank = 0 : i32, sym_name = "buf10"} : memref<4x1x4x4xi32, 2 : i32> + %buf9 = aie.buffer(%tile_3_2) {mem_bank = 0 : i32, sym_name = "buf9"} : memref<4x4x4x4xi32, 2 : i32> + %buf8 = aie.buffer(%tile_2_2) {mem_bank = 0 : i32, sym_name = "buf8"} : memref<1x4x4x4xi32, 2 : i32> + %buf7 = aie.buffer(%tile_2_2) {mem_bank = 0 : i32, sym_name = "buf7"} : memref<4x1x4x4xi32, 2 : i32> + %buf6 = aie.buffer(%tile_2_2) {mem_bank = 0 : i32, sym_name = "buf6"} : memref<4x4x4x4xi32, 2 : i32> + %buf5 = aie.buffer(%tile_1_2) {mem_bank = 0 : i32, sym_name = "buf5"} : memref<1x4x4x4xi32, 2 : i32> + %buf4 = aie.buffer(%tile_1_2) {mem_bank = 0 : i32, sym_name = "buf4"} : memref<4x1x4x4xi32, 2 : i32> + %buf3 = aie.buffer(%tile_1_2) {mem_bank = 0 : i32, sym_name = "buf3"} : memref<4x4x4x4xi32, 2 : i32> + %buf2 = aie.buffer(%tile_0_2) {mem_bank = 0 : i32, sym_name = "buf2"} : memref<1x4x4x4xi32, 2 : i32> + %buf1 = aie.buffer(%tile_0_2) {mem_bank = 0 : i32, sym_name = "buf1"} : memref<4x1x4x4xi32, 2 : i32> + %buf0 = aie.buffer(%tile_0_2) {mem_bank = 0 : i32, sym_name = "buf0"} : memref<4x4x4x4xi32, 2 : i32> + %mem_0_2 = aie.mem(%tile_0_2) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_2_5, AcquireGreaterEqual, 1) + aie.dma_bd(%buf2 : memref<1x4x4x4xi32, 2 : i32>, 0, 64) + aie.use_lock(%lock_0_2_6, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(S2MM, 1, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_2, AcquireGreaterEqual, 1) + aie.dma_bd(%buf1 : memref<4x1x4x4xi32, 2 : i32>, 0, 64) + aie.use_lock(%lock_0_2_4, Release, 1) + aie.next_bd ^bb4 + } + %core_0_2 = aie.core(%tile_0_2) { + %c0_i32 = arith.constant 0 : i32 + %c4 = arith.constant 4 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_2_6, AcquireGreaterEqual, 1) + aie.use_lock(%lock_0_2_4, AcquireGreaterEqual, 1) + // + func.call @event_0() : () -> () + // + scf.for %arg0 = %c0 to %c4 step %c1 { + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c4 step %c1 { + memref.store %c0_i32, %buf0[%arg0, %arg1, %arg2, %arg3] : memref<4x4x4x4xi32, 2 : i32> + } + } + } + } + // + func.call @event_1() : () -> () + // + func.call @matmul_scalar_put_4x1x4_4x4x4_i32_i32(%buf2, %buf1, %buf0) : (memref<1x4x4x4xi32, 2 : i32>, 
memref<4x1x4x4xi32, 2 : i32>, memref<4x4x4x4xi32, 2 : i32>) -> () + aie.use_lock(%lock_0_2_5, Release, 1) + aie.use_lock(%lock_0_2, Release, 1) + // + func.call @flush_trace() : () -> () + // + cf.br ^bb1 + } {elf_file = "segment_0_core_0_2.elf",link_with = "mm.o"} + %mem_1_2 = aie.mem(%tile_1_2) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_1_2_8, AcquireGreaterEqual, 1) + aie.dma_bd(%buf5 : memref<1x4x4x4xi32, 2 : i32>, 0, 64) + aie.use_lock(%lock_1_2_9, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(S2MM, 1, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_1_2, AcquireGreaterEqual, 1) + aie.dma_bd(%buf4 : memref<4x1x4x4xi32, 2 : i32>, 0, 64) + aie.use_lock(%lock_1_2_7, Release, 1) + aie.next_bd ^bb4 + } + %core_1_2 = aie.core(%tile_1_2) { + %c0_i32 = arith.constant 0 : i32 + %c4 = arith.constant 4 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_1_2_9, AcquireGreaterEqual, 1) + aie.use_lock(%lock_1_2_7, AcquireGreaterEqual, 1) + // + func.call @event_0() : () -> () + // + scf.for %arg0 = %c0 to %c4 step %c1 { + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c4 step %c1 { + memref.store %c0_i32, %buf3[%arg0, %arg1, %arg2, %arg3] : memref<4x4x4x4xi32, 2 : i32> + } + } + } + } + // + func.call @event_1() : () -> () + // + func.call @matmul_scalar_put_get_4x1x4_4x4x4_i32_i32(%buf5, %buf4, %buf3) : (memref<1x4x4x4xi32, 2 : i32>, memref<4x1x4x4xi32, 2 : i32>, memref<4x4x4x4xi32, 2 : i32>) -> () + aie.use_lock(%lock_1_2_8, Release, 1) + aie.use_lock(%lock_1_2, Release, 1) + // + func.call @flush_trace() : () -> () + // + cf.br ^bb1 + } {elf_file = "segment_0_core_1_2.elf",link_with = "mm.o"} + %mem_2_2 = aie.mem(%tile_2_2) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_2_2_11, AcquireGreaterEqual, 1) + aie.dma_bd(%buf8 : memref<1x4x4x4xi32, 2 : i32>, 0, 64) + aie.use_lock(%lock_2_2_12, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(S2MM, 1, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_2_2, AcquireGreaterEqual, 1) + aie.dma_bd(%buf7 : memref<4x1x4x4xi32, 2 : i32>, 0, 64) + aie.use_lock(%lock_2_2_10, Release, 1) + aie.next_bd ^bb4 + } + %core_2_2 = aie.core(%tile_2_2) { + %c0_i32 = arith.constant 0 : i32 + %c4 = arith.constant 4 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_2_2_12, AcquireGreaterEqual, 1) + aie.use_lock(%lock_2_2_10, AcquireGreaterEqual, 1) + // + func.call @event_0() : () -> () + // + scf.for %arg0 = %c0 to %c4 step %c1 { + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c4 step %c1 { + memref.store %c0_i32, %buf6[%arg0, %arg1, %arg2, %arg3] : memref<4x4x4x4xi32, 2 : i32> + } + } + } + } + // + func.call @event_1() : () -> () + // + func.call @matmul_scalar_put_get_4x1x4_4x4x4_i32_i32(%buf8, %buf7, %buf6) : (memref<1x4x4x4xi32, 2 : i32>, memref<4x1x4x4xi32, 2 : i32>, memref<4x4x4x4xi32, 2 : i32>) -> () + aie.use_lock(%lock_2_2_11, Release, 
1) + aie.use_lock(%lock_2_2, Release, 1) + // + func.call @flush_trace() : () -> () + // + cf.br ^bb1 + } {elf_file = "segment_0_core_2_2.elf",link_with = "mm.o"} + %mem_3_2 = aie.mem(%tile_3_2) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb5, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_3_2_14, AcquireGreaterEqual, 1) + aie.dma_bd(%buf11 : memref<1x4x4x4xi32, 2 : i32>, 0, 64) + aie.use_lock(%lock_3_2_15, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(S2MM, 1, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_3_2, AcquireGreaterEqual, 1) + aie.dma_bd(%buf10 : memref<4x1x4x4xi32, 2 : i32>, 0, 64) + aie.use_lock(%lock_3_2_13, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb0 + %2 = aie.dma_start(MM2S, 0, ^bb6, ^bb3, repeat_count = 1) + ^bb6: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_3_2_17, AcquireGreaterEqual, 1) + aie.dma_bd(%buf9 : memref<4x4x4x4xi32, 2 : i32>, 0, 256, [, , ]) + aie.use_lock(%lock_3_2_16, Release, 1) + aie.next_bd ^bb6 + } + %core_3_2 = aie.core(%tile_3_2) { + %c0_i32 = arith.constant 0 : i32 + %c4 = arith.constant 4 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_3_2_16, AcquireGreaterEqual, 1) + aie.use_lock(%lock_3_2_15, AcquireGreaterEqual, 1) + aie.use_lock(%lock_3_2_13, AcquireGreaterEqual, 1) + // + func.call @event_0() : () -> () + // + scf.for %arg0 = %c0 to %c4 step %c1 { + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c4 step %c1 { + memref.store %c0_i32, %buf9[%arg0, %arg1, %arg2, %arg3] : memref<4x4x4x4xi32, 2 : i32> + } + } + } + } + // + func.call @event_1() : () -> () + // + func.call @matmul_scalar_get_4x1x4_4x4x4_i32_i32(%buf11, %buf10, %buf9) : (memref<1x4x4x4xi32, 2 : i32>, memref<4x1x4x4xi32, 2 : i32>, memref<4x4x4x4xi32, 2 : i32>) -> () + aie.use_lock(%lock_3_2_17, Release, 1) + aie.use_lock(%lock_3_2_14, Release, 1) + aie.use_lock(%lock_3_2, Release, 1) + // + func.call @flush_trace() : () -> () + // + cf.br ^bb1 + } {elf_file = "segment_0_core_3_2.elf", link_with = "mm.o"} + aie.flow(%tile_0_0, DMA : 0, %tile_0_1, DMA : 0) + aie.flow(%tile_0_0, DMA : 1, %tile_1_1, DMA : 0) + aie.flow(%tile_0_1, DMA : 0, %tile_0_2, DMA : 0) + aie.flow(%tile_0_1, DMA : 1, %tile_1_2, DMA : 0) + aie.flow(%tile_0_1, DMA : 2, %tile_2_2, DMA : 0) + aie.flow(%tile_0_1, DMA : 3, %tile_3_2, DMA : 0) + aie.flow(%tile_1_1, DMA : 0, %tile_0_2, DMA : 1) + aie.flow(%tile_1_1, DMA : 1, %tile_1_2, DMA : 1) + aie.flow(%tile_1_1, DMA : 2, %tile_2_2, DMA : 1) + aie.flow(%tile_1_1, DMA : 3, %tile_3_2, DMA : 1) + aie.flow(%tile_3_2, DMA : 0, %tile_0_1, DMA : 1) + aie.flow(%tile_0_1, DMA : 4, %tile_0_0, DMA : 0) + aie.cascade_flow(%tile_0_2, %tile_1_2) + aie.cascade_flow(%tile_1_2, %tile_2_2) + aie.cascade_flow(%tile_2_2, %tile_3_2) + // + aie.packet_flow(0) { + aie.packet_source<%tile_0_2, Trace : 0> + aie.packet_dest<%tile_0_0, DMA : 1> + } {keep_pkt_header = true} + aie.packet_flow(1) { + aie.packet_source<%tile_1_2, Trace : 0> + aie.packet_dest<%tile_1_0, DMA : 1> + } {keep_pkt_header = true} + aie.packet_flow(2) { + aie.packet_source<%tile_2_2, Trace : 0> + aie.packet_dest<%tile_2_0, DMA : 1> + } {keep_pkt_header = true} + aie.packet_flow(3) { + aie.packet_source<%tile_3_2, Trace : 0> + aie.packet_dest<%tile_3_0, DMA : 1> + } {keep_pkt_header = true} + 
aie.packet_flow(4) { + aie.packet_source<%tile_0_1, Trace : 0> + aie.packet_dest<%tile_0_0, DMA : 1> + } {keep_pkt_header = true} + aie.packet_flow(5) { + aie.packet_source<%tile_1_1, Trace : 0> + aie.packet_dest<%tile_1_0, DMA : 1> + } {keep_pkt_header = true} + // + %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb13, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_1, AcquireGreaterEqual, 4) + aie.dma_bd(%buf14 : memref<16x16xi32, 1 : i32>, 0, 256) + aie.use_lock(%lock_0_1_1, Release, 4) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb5 + %1 = aie.dma_start(S2MM, 1, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_1_2, AcquireGreaterEqual, 1) + aie.dma_bd(%buf12 : memref<16x16xi32, 1 : i32>, 0, 256) + aie.use_lock(%lock_0_1_3, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb5 + %2 = aie.dma_start(MM2S, 0, ^bb6, ^bb3, repeat_count = 1) + ^bb6: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf14 : memref<16x16xi32, 1 : i32>, 0, 64, [, , ]) + aie.use_lock(%lock_0_1, Release, 1) + aie.next_bd ^bb6 + ^bb7: // pred: ^bb7 + %3 = aie.dma_start(MM2S, 1, ^bb8, ^bb5, repeat_count = 1) + ^bb8: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf14 : memref<16x16xi32, 1 : i32>, 4, 64, [, , ]) + aie.use_lock(%lock_0_1, Release, 1) + aie.next_bd ^bb8 + ^bb9: // pred: ^bb9 + %4 = aie.dma_start(MM2S, 2, ^bb10, ^bb7, repeat_count = 1) + ^bb10: // 2 preds: ^bb7, ^bb8 + aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf14 : memref<16x16xi32, 1 : i32>, 8, 64, [, , ]) + aie.use_lock(%lock_0_1, Release, 1) + aie.next_bd ^bb10 + ^bb11: // pred: ^bb0 + %5 = aie.dma_start(MM2S, 3, ^bb12, ^bb9, repeat_count = 1) + ^bb12: // 2 preds: ^bb9, ^bb10 + aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf14 : memref<16x16xi32, 1 : i32>, 12, 64, [, , ]) + aie.use_lock(%lock_0_1, Release, 1) + aie.next_bd ^bb12 + ^bb13: // pred: ^bb0 + %6 = aie.dma_start(MM2S, 4, ^bb14, ^bb11, repeat_count = 1) + ^bb14: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_1_3, AcquireGreaterEqual, 1) + aie.dma_bd(%buf12 : memref<16x16xi32, 1 : i32>, 0, 256) + aie.use_lock(%lock_0_1_2, Release, 1) + aie.next_bd ^bb14 + } + %memtile_dma_1_1 = aie.memtile_dma(%tile_1_1) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb9, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_1_1, AcquireGreaterEqual, 4) + aie.dma_bd(%buf13 : memref<16x16xi32, 1 : i32>, 0, 256) + aie.use_lock(%lock_1_1_0, Release, 4) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb5 + %1 = aie.dma_start(MM2S, 0, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1) + aie.dma_bd(%buf13 : memref<16x16xi32, 1 : i32>, 0, 64, [, , ]) + aie.use_lock(%lock_1_1, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb7 + %2 = aie.dma_start(MM2S, 1, ^bb6, ^bb3, repeat_count = 1) + ^bb6: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1) + aie.dma_bd(%buf13 : memref<16x16xi32, 1 : i32>, 64, 64, [, , ]) + aie.use_lock(%lock_1_1, Release, 1) + aie.next_bd ^bb6 + ^bb7: // pred: ^bb9 + %3 = aie.dma_start(MM2S, 2, ^bb8, ^bb5, repeat_count = 1) + ^bb8: // 2 preds: ^bb7, ^bb8 + aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1) + aie.dma_bd(%buf13 : memref<16x16xi32, 1 : i32>, 128, 64, [, , ]) + aie.use_lock(%lock_1_1, Release, 1) + aie.next_bd ^bb8 + ^bb9: // 
pred: ^bb0 + %4 = aie.dma_start(MM2S, 3, ^bb10, ^bb7, repeat_count = 1) + ^bb10: // 2 preds: ^bb9, ^bb10 + aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1) + aie.dma_bd(%buf13 : memref<16x16xi32, 1 : i32>, 192, 64, [, , ]) + aie.use_lock(%lock_1_1, Release, 1) + aie.next_bd ^bb10 + } + aie.shim_dma_allocation @airMemcpyId12(S2MM, 0, 0) + memref.global "public" @airMemcpyId12 : memref<16x16xi32, 1 : i32> + aie.shim_dma_allocation @airMemcpyId4(MM2S, 0, 0) + memref.global "public" @airMemcpyId4 : memref<16x16xi32, 1 : i32> + aie.shim_dma_allocation @airMemcpyId5(MM2S, 1, 0) + memref.global "public" @airMemcpyId5 : memref<16x16xi32, 1 : i32> + func.func @matmul_16x16_16xi32__dispatch_0_matmul_16x16x16_i32(%arg0: memref<16x16xi32>, %arg1: memref<16x16xi32>, %arg2: memref<16x16xi32>) { + // + aiex.npu.write32 {address = 212992 : ui32, column = 3 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213200 : ui32, column = 3 : i32, row = 2 : i32, value = 7995392 : ui32} // [22:16] start event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213204 : ui32, column = 3 : i32, row = 2 : i32, value = 3 : ui32} // packet_type: 0(core), packet_id: 3 + aiex.npu.write32 {address = 213216 : ui32, column = 3 : i32, row = 2 : i32, value = 1260527873 : ui32} // events: 0x4B(port0 run) 22(event1) 21(event0) 01(true) + aiex.npu.write32 {address = 213220 : ui32, column = 3 : i32, row = 2 : i32, value = 757865039 : ui32} // events: 0x2D(lock release) 2C(lock acquire) 1A(lock stall) 4F(port1 run) + aiex.npu.write32 {address = 261888 : ui32, column = 3 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 + aiex.npu.write32 {address = 261892 : ui32, column = 3 : i32, row = 2 : i32, value = 0 : ui32} + aiex.npu.writebd_shimtile {bd_id = 15 : i32, buffer_length = 8192 : i32, buffer_offset = 25600 : i32, column = 3 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 3: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.write32 {address = 119308 : ui32, column = 3 : i32, row = 0 : i32, value = 15 : ui32} + + aiex.npu.write32 {address = 212992 : ui32, column = 2 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213200 : ui32, column = 2 : i32, row = 2 : i32, value = 7995392 : ui32} // [22:16] start event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213204 : ui32, column = 2 : i32, row = 2 : i32, value = 2 : ui32} // packet_type: 0(core), packet_id: 2 + aiex.npu.write32 {address = 213216 : ui32, column = 2 : i32, row = 2 : i32, value = 1260527873 : ui32} // events: 0x4B(port0 run) 22(event1) 21(event0) 01(true) + aiex.npu.write32 {address = 213220 : ui32, column = 2 : i32, row = 2 : i32, value = 757865039 : ui32} // events: 0x2D(lock release) 2C(lock acquire) 1A(lock stall) 4F(port1 run) + aiex.npu.write32 {address = 261888 : ui32, column = 2 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 + aiex.npu.write32 {address = 261892 : ui32, column = 2 : i32, row = 2 : i32, value = 0 : ui32} + aiex.npu.writebd_shimtile 
{bd_id = 14 : i32, buffer_length = 8192 : i32, buffer_offset = 17408 : i32, column = 2 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 2: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.write32 {address = 119308 : ui32, column = 2 : i32, row = 0 : i32, value = 14 : ui32} + + aiex.npu.write32 {address = 212992 : ui32, column = 1 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213200 : ui32, column = 1 : i32, row = 2 : i32, value = 7995392 : ui32} // [22:16] start event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213204 : ui32, column = 1 : i32, row = 2 : i32, value = 1 : ui32} // packet_type: 0(core), packet_id: 1 + aiex.npu.write32 {address = 213216 : ui32, column = 1 : i32, row = 2 : i32, value = 1260527873 : ui32} // events: 0x4B(port0 run) 22(event1) 21(event0) 01(true) + aiex.npu.write32 {address = 213220 : ui32, column = 1 : i32, row = 2 : i32, value = 757865039 : ui32} // events: 0x2D(lock release) 2C(lock acquire) 1A(lock stall) 4F(port1 run) + aiex.npu.write32 {address = 261888 : ui32, column = 1 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 + aiex.npu.write32 {address = 261892 : ui32, column = 1 : i32, row = 2 : i32, value = 0 : ui32} + aiex.npu.writebd_shimtile {bd_id = 13 : i32, buffer_length = 8192 : i32, buffer_offset = 9216 : i32, column = 1 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 1: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.write32 {address = 119308 : ui32, column = 1 : i32, row = 0 : i32, value = 13 : ui32} + + aiex.npu.write32 {address = 212992 : ui32, column = 0 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213200 : ui32, column = 0 : i32, row = 2 : i32, value = 7995392 : ui32} // [22:16] start event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213204 : ui32, column = 0 : i32, row = 2 : i32, value = 0 : ui32} // packet_type: 0(core), packet_id: 0 + aiex.npu.write32 {address = 213216 : ui32, column = 0 : i32, row = 2 : i32, value = 1260527873 : ui32} // events: 0x4B(port0 run) 22(event1) 21(event0) 01(true) + aiex.npu.write32 {address = 213220 : ui32, column = 0 : i32, row = 2 : i32, value = 757865039 : ui32} // events: 0x2D(lock release) 2C(lock acquire) 1A(lock stall) 4F(port1 run) + aiex.npu.write32 {address = 261888 : ui32, column = 0 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 + aiex.npu.write32 {address = 261892 : ui32, column = 0 : i32, row = 2 : i32, value = 0 : ui32} + aiex.npu.writebd_shimtile {bd_id = 12 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, column_num 
= 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 12 : ui32} + + aiex.npu.write32 {address = 606208 : ui32, column = 1 : i32, row = 1 : i32, value = 40192 : ui32} // [15:8] reset event: 157(BROADCAST_15) + aiex.npu.write32 {address = 606416 : ui32, column = 1 : i32, row = 1 : i32, value = 10289152 : ui32} // [23:16] start event: 157(BROADCAST_15) + aiex.npu.write32 {address = 606420 : ui32, column = 1 : i32, row = 1 : i32, value = 12293 : ui32} // [14:12] packet_type: 3(mem_tile), [4:0] packet_id: 5 + aiex.npu.write32 {address = 606432 : ui32, column = 1 : i32, row = 1 : i32, value = 336 : ui32} // events: 0x00 00 01(true) 50(port0 run) + aiex.npu.write32 {address = 606436 : ui32, column = 1 : i32, row = 1 : i32, value = 1415076960 : ui32} // events: 0x54(port1 run) 58(port2 run) 5C(port3 run) 60(port4 run) + aiex.npu.write32 {address = 724736 : ui32, column = 1 : i32, row = 1 : i32, value = 33620000 : ui32} // [29:24] port3 MM2S-2, [21:16] port2 MM2S-1, [13:8] port1 MM2S-0, [5:0] port0 S2MM-0 + aiex.npu.write32 {address = 724740: ui32, column = 1 : i32, row = 1 : i32, value = 3 : ui32} // [5:0] port4 MM2S-3 + aiex.npu.writebd_shimtile {bd_id = 11 : i32, buffer_length = 8192 : i32, buffer_offset = 9216 : i32, column = 1 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 5: i32, packet_type = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.write32 {address = 119308 : ui32, column = 1 : i32, row = 0 : i32, value = 11 : ui32} + + aiex.npu.write32 {address = 606208 : ui32, column = 0 : i32, row = 1 : i32, value = 40192 : ui32} // [15:8] reset event: 157(BROADCAST_15) + aiex.npu.write32 {address = 606416 : ui32, column = 0 : i32, row = 1 : i32, value = 10289152 : ui32} // [23:16] start event: 157(BROADCAST_15) + aiex.npu.write32 {address = 606420 : ui32, column = 0 : i32, row = 1 : i32, value = 12292 : ui32} // [14:12] packet_type: 3(mem_tile), [4:0] packet_id: 4 + aiex.npu.write32 {address = 606432 : ui32, column = 0 : i32, row = 1 : i32, value = 760239192 : ui32} // events: 0x2D(lock release) 50(port0 run) 0x54(port1 run) 58(port2 run) + aiex.npu.write32 {address = 606436 : ui32, column = 0 : i32, row = 1 : i32, value = 1549821032 : ui32} // events: 5C(port3 run) 60(port4 run) 64(port5 run) 68(port6 run) + aiex.npu.write32 {address = 724736 : ui32, column = 0 : i32, row = 1 : i32, value = 33620000 : ui32} // [29:24] port3 MM2S-2, [21:16] port2 MM2S-1, [13:8] port1 MM2S-0, [5:0] port0 S2MM-0 + aiex.npu.write32 {address = 724740: ui32, column = 0 : i32, row = 1 : i32, value = 270595 : ui32} // [21:16] port6 MM2S-4, [13:8] port5 S2MM-1, [5:0] port4 MM2S-3 + aiex.npu.writebd_shimtile 
{bd_id = 10 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 4: i32, packet_type = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 10 : ui32} + + aiex.npu.write32 {address = 212992: ui32, column = 0 : i32, row = 0 : i32, value = 32512 : ui32} // [14:8] reset event: 127(USER_EVENT_1) + aiex.npu.write32 {address = 213068: ui32, column = 0 : i32, row = 0 : i32, value = 127 : ui32} // [6:0] broadcast 15: 127(USER_EVENT_1) + aiex.npu.write32 {address = 213000: ui32, column = 0 : i32, row = 0 : i32, value = 127 : ui32} // event generate [6:0]: 127(USER_EVENT_1) + + // + memref.assume_alignment %arg0, 64 : memref<16x16xi32> + memref.assume_alignment %arg1, 64 : memref<16x16xi32> + memref.assume_alignment %arg2, 64 : memref<16x16xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<16x16xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<16x16xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 2 : i64, metadata = @airMemcpyId12} : memref<16x16xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} + return + } + } {sym_name = "segment_0"} +} diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx1.mlir b/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx1.mlir new file mode 100644 index 0000000000..e6d4d7df97 --- /dev/null +++ b/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx1.mlir @@ -0,0 +1,201 @@ +module { + aie.device(npu1_4col) { + func.func private @matmul_scalar_4x2x4_4x8x4_i32_i32(memref<2x4x4x8xi32, 2 : i32>, memref<4x2x8x4xi32, 2 : i32>, memref<4x4x4x4xi32, 2 : i32>) + // + func.func private @event_0() + func.func private @event_1() + func.func private @flush_trace() + // + %tile_0_0 = aie.tile(0, 0) + %tile_0_1 = aie.tile(0, 1) + %tile_1_1 = aie.tile(1, 1) + %tile_2_1 = aie.tile(2, 1) + %tile_0_2 = aie.tile(0, 2) + %lock_1_1 = aie.lock(%tile_1_1, 1) {init = 1 : i32} + %lock_1_1_0 = aie.lock(%tile_1_1, 0) {init = 0 : i32} + %lock_0_1 = aie.lock(%tile_0_1, 1) {init = 1 : i32} + %lock_0_1_1 = aie.lock(%tile_0_1, 0) {init = 0 : i32} + %lock_2_1 = aie.lock(%tile_2_1, 1) {init = 1 : i32} + %lock_2_1_2 = aie.lock(%tile_2_1, 0) {init = 0 : i32} + %lock_0_2 = aie.lock(%tile_0_2, 5) {init = 1 : i32} + %lock_0_2_3 = aie.lock(%tile_0_2, 4) {init = 0 : i32} + %lock_0_2_4 = aie.lock(%tile_0_2, 3) {init = 1 : i32} + %lock_0_2_5 = aie.lock(%tile_0_2, 2) {init = 0 : i32} + %lock_0_2_6 = aie.lock(%tile_0_2, 1) {init = 1 : i32} + %lock_0_2_7 = aie.lock(%tile_0_2, 0) {init = 0 : i32} + %buf5 = aie.buffer(%tile_0_1) {mem_bank = 0 : i32, sym_name = "buf5"} : memref<16x16xi32, 1 : i32> + %buf4 = aie.buffer(%tile_1_1) {mem_bank = 0 : i32, sym_name = "buf4"} : memref<16x16xi32, 1 : i32> + %buf3 = aie.buffer(%tile_2_1) {mem_bank = 0 : i32, sym_name = "buf3"} : 
memref<16x16xi32, 1 : i32> + %buf2 = aie.buffer(%tile_0_2) {mem_bank = 0 : i32, sym_name = "buf2"} : memref<2x4x4x8xi32, 2 : i32> + %buf1 = aie.buffer(%tile_0_2) {mem_bank = 0 : i32, sym_name = "buf1"} : memref<4x2x8x4xi32, 2 : i32> + %buf0 = aie.buffer(%tile_0_2) {mem_bank = 0 : i32, sym_name = "buf0"} : memref<4x4x4x4xi32, 2 : i32> + %mem_0_2 = aie.mem(%tile_0_2) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb5, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_2_4, AcquireGreaterEqual, 1) + aie.dma_bd(%buf2 : memref<2x4x4x8xi32, 2 : i32>, 0, 256) + aie.use_lock(%lock_0_2_5, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb5 + %1 = aie.dma_start(S2MM, 1, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_2, AcquireGreaterEqual, 1) + aie.dma_bd(%buf1 : memref<4x2x8x4xi32, 2 : i32>, 0, 256) + aie.use_lock(%lock_0_2_3, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb0 + %2 = aie.dma_start(MM2S, 0, ^bb6, ^bb3, repeat_count = 1) + ^bb6: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_0_2_7, AcquireGreaterEqual, 1) + aie.dma_bd(%buf0 : memref<4x4x4x4xi32, 2 : i32>, 0, 256, [, , ]) + aie.use_lock(%lock_0_2_6, Release, 1) + aie.next_bd ^bb6 + } + %core_0_2 = aie.core(%tile_0_2) { + %c0_i32 = arith.constant 0 : i32 + %c4 = arith.constant 4 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_2_6, AcquireGreaterEqual, 1) + aie.use_lock(%lock_0_2_5, AcquireGreaterEqual, 1) + aie.use_lock(%lock_0_2_3, AcquireGreaterEqual, 1) + // + func.call @event_0() : () -> () + // + scf.for %arg0 = %c0 to %c4 step %c1 { + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c4 step %c1 { + memref.store %c0_i32, %buf0[%arg0, %arg1, %arg2, %arg3] : memref<4x4x4x4xi32, 2 : i32> + } + } + } + } + // + func.call @event_1() : () -> () + // + func.call @matmul_scalar_4x2x4_4x8x4_i32_i32(%buf2, %buf1, %buf0) : (memref<2x4x4x8xi32, 2 : i32>, memref<4x2x8x4xi32, 2 : i32>, memref<4x4x4x4xi32, 2 : i32>) -> () + aie.use_lock(%lock_0_2_7, Release, 1) + aie.use_lock(%lock_0_2_4, Release, 1) + aie.use_lock(%lock_0_2, Release, 1) + // + func.call @flush_trace() : () -> () + // + cf.br ^bb1 + } {elf_file = "segment_0_core_0_2.elf", link_with = "mm.o"} + aie.flow(%tile_0_0, DMA : 0, %tile_0_1, DMA : 0) + aie.flow(%tile_0_0, DMA : 1, %tile_1_1, DMA : 0) + aie.flow(%tile_2_1, DMA : 0, %tile_0_0, DMA : 0) + aie.flow(%tile_0_1, DMA : 0, %tile_0_2, DMA : 0) + aie.flow(%tile_1_1, DMA : 0, %tile_0_2, DMA : 1) + aie.flow(%tile_0_2, DMA : 0, %tile_2_1, DMA : 0) + // + aie.packet_flow(0) { + aie.packet_source<%tile_0_2, Trace : 0> + aie.packet_dest<%tile_0_0, DMA : 1> + } {keep_pkt_header = true} + aie.packet_flow(4) { + aie.packet_source<%tile_0_1, Trace : 0> + aie.packet_dest<%tile_0_0, DMA : 1> + } {keep_pkt_header = true} + // + %memtile_dma_2_1 = aie.memtile_dma(%tile_2_1) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_2_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf3 : memref<16x16xi32, 1 : i32>, 0, 256) + aie.use_lock(%lock_2_1_2, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(MM2S, 0, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_2_1_2, AcquireGreaterEqual, 1) + aie.dma_bd(%buf3 : 
memref<16x16xi32, 1 : i32>, 0, 256) + aie.use_lock(%lock_2_1, Release, 1) + aie.next_bd ^bb4 + } + %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf5 : memref<16x16xi32, 1 : i32>, 0, 256) + aie.use_lock(%lock_0_1_1, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(MM2S, 0, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf5 : memref<16x16xi32, 1 : i32>, 0, 256, [, , ]) + aie.use_lock(%lock_0_1, Release, 1) + aie.next_bd ^bb4 + } + %memtile_dma_1_1 = aie.memtile_dma(%tile_1_1) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf4 : memref<16x16xi32, 1 : i32>, 0, 256) + aie.use_lock(%lock_1_1_0, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(MM2S, 0, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1) + aie.dma_bd(%buf4 : memref<16x16xi32, 1 : i32>, 0, 256, [, , ]) + aie.use_lock(%lock_1_1, Release, 1) + aie.next_bd ^bb4 + } + aie.shim_dma_allocation @airMemcpyId12(S2MM, 0, 0) + memref.global "public" @airMemcpyId12 : memref<16x16xi32, 1 : i32> + aie.shim_dma_allocation @airMemcpyId4(MM2S, 0, 0) + memref.global "public" @airMemcpyId4 : memref<16x16xi32, 1 : i32> + aie.shim_dma_allocation @airMemcpyId5(MM2S, 1, 0) + memref.global "public" @airMemcpyId5 : memref<16x16xi32, 1 : i32> + func.func @matmul_16x16_16xi32__dispatch_0_matmul_16x16x16_i32(%arg0: memref<16x16xi32>, %arg1: memref<16x16xi32>, %arg2: memref<16x16xi32>) { + // + aiex.npu.write32 {address = 212992 : ui32, column = 0 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213200 : ui32, column = 0 : i32, row = 2 : i32, value = 7995392 : ui32} // [22:16] start event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213204 : ui32, column = 0 : i32, row = 2 : i32, value = 0 : ui32} // packet_type: 0(core), packet_id: 0 + aiex.npu.write32 {address = 213216 : ui32, column = 0 : i32, row = 2 : i32, value = 1260527873 : ui32} // events: 0x4B(port0 run) 22(event1) 21(event0) 01(true) + aiex.npu.write32 {address = 213220 : ui32, column = 0 : i32, row = 2 : i32, value = 757865039 : ui32} // events: 0x2D(lock release) 2C(lock acquire) 1A(lock stall) 4F(port1 run) + aiex.npu.write32 {address = 261888 : ui32, column = 0 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 + aiex.npu.write32 {address = 261892 : ui32, column = 0 : i32, row = 2 : i32, value = 0 : ui32} + aiex.npu.writebd_shimtile {bd_id = 12 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.write32 {address = 119308 : ui32, column = 0 : 
i32, row = 0 : i32, value = 12 : ui32} + + aiex.npu.write32 {address = 606208 : ui32, column = 0 : i32, row = 1 : i32, value = 40192 : ui32} // [15:8] reset event: 157(BROADCAST_15) + aiex.npu.write32 {address = 606416 : ui32, column = 0 : i32, row = 1 : i32, value = 10289152 : ui32} // [23:16] start event: 157(BROADCAST_15) + aiex.npu.write32 {address = 606420 : ui32, column = 0 : i32, row = 1 : i32, value = 12292 : ui32} // [14:12] packet_type: 3(mem_tile), [4:0] packet_id: 4 + aiex.npu.write32 {address = 606432 : ui32, column = 0 : i32, row = 1 : i32, value = 760239192 : ui32} // events: 0x2D(lock release) 50(port0 run) 0x54(port1 run) 58(port2 run) + aiex.npu.write32 {address = 606436 : ui32, column = 0 : i32, row = 1 : i32, value = 1549821032 : ui32} // events: 5C(port3 run) 60(port4 run) 64(port5 run) 68(port6 run) + aiex.npu.write32 {address = 724736 : ui32, column = 0 : i32, row = 1 : i32, value = 33620000 : ui32} // [29:24] port3 MM2S-2, [21:16] port2 MM2S-1, [13:8] port1 MM2S-0, [5:0] port0 S2MM-0 + aiex.npu.write32 {address = 724740: ui32, column = 0 : i32, row = 1 : i32, value = 270595 : ui32} // [21:16] port6 MM2S-4, [13:8] port5 S2MM-1, [5:0] port4 MM2S-3 + aiex.npu.writebd_shimtile {bd_id = 10 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 4: i32, packet_type = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 10 : ui32} + + aiex.npu.write32 {address = 212992: ui32, column = 0 : i32, row = 0 : i32, value = 32512 : ui32} // [14:8] reset event: 127(USER_EVENT_1) + aiex.npu.write32 {address = 213068: ui32, column = 0 : i32, row = 0 : i32, value = 127 : ui32} // [6:0] broadcast 15: 127(USER_EVENT_1) + aiex.npu.write32 {address = 213000: ui32, column = 0 : i32, row = 0 : i32, value = 127 : ui32} // event generate [6:0]: 127(USER_EVENT_1) + // + memref.assume_alignment %arg0, 64 : memref<16x16xi32> + memref.assume_alignment %arg1, 64 : memref<16x16xi32> + memref.assume_alignment %arg2, 64 : memref<16x16xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<16x16xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<16x16xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 2 : i64, metadata = @airMemcpyId12} : memref<16x16xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} + return + } + } {sym_name = "segment_0"} +} diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx4.mlir b/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx4.mlir new file mode 100644 index 0000000000..6ffc1cfda2 --- /dev/null +++ b/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx4.mlir @@ -0,0 +1,532 @@ +module { + aie.device(npu1_4col) { + func.func private @matmul_scalar_2x2x2_4x8x4_i32_i32(memref<2x2x4x8xi32, 2 : i32>, memref<2x2x8x4xi32, 2 : i32>, memref<2x2x4x4xi32, 
2 : i32>) + // + func.func private @event_0() + func.func private @event_1() + func.func private @flush_trace() + // + %tile_0_0 = aie.tile(0, 0) + %tile_0_1 = aie.tile(0, 1) + %tile_1_1 = aie.tile(1, 1) + %tile_2_1 = aie.tile(2, 1) + %tile_0_2 = aie.tile(0, 2) + %tile_1_2 = aie.tile(1, 2) + %tile_2_2 = aie.tile(2, 2) + %tile_3_2 = aie.tile(3, 2) + // + %tile_1_0 = aie.tile(1, 0) + %tile_2_0 = aie.tile(2, 0) + %tile_3_0 = aie.tile(3, 0) + // + %lock_2_1 = aie.lock(%tile_2_1, 1) {init = 4 : i32} + %lock_2_1_0 = aie.lock(%tile_2_1, 0) {init = 0 : i32} + %lock_1_1 = aie.lock(%tile_1_1, 1) {init = 2 : i32} + %lock_1_1_1 = aie.lock(%tile_1_1, 0) {init = 0 : i32} + %lock_0_1 = aie.lock(%tile_0_1, 1) {init = 2 : i32} + %lock_0_1_2 = aie.lock(%tile_0_1, 0) {init = 0 : i32} + %lock_0_2 = aie.lock(%tile_0_2, 5) {init = 1 : i32} + %lock_0_2_4 = aie.lock(%tile_0_2, 4) {init = 0 : i32} + %lock_0_2_5 = aie.lock(%tile_0_2, 3) {init = 1 : i32} + %lock_0_2_6 = aie.lock(%tile_0_2, 2) {init = 0 : i32} + %lock_0_2_7 = aie.lock(%tile_0_2, 1) {init = 1 : i32} + %lock_0_2_8 = aie.lock(%tile_0_2, 0) {init = 0 : i32} + %lock_1_2 = aie.lock(%tile_1_2, 5) {init = 1 : i32} + %lock_1_2_9 = aie.lock(%tile_1_2, 4) {init = 0 : i32} + %lock_1_2_10 = aie.lock(%tile_1_2, 3) {init = 1 : i32} + %lock_1_2_11 = aie.lock(%tile_1_2, 2) {init = 0 : i32} + %lock_1_2_12 = aie.lock(%tile_1_2, 1) {init = 1 : i32} + %lock_1_2_13 = aie.lock(%tile_1_2, 0) {init = 0 : i32} + %lock_2_2 = aie.lock(%tile_2_2, 5) {init = 1 : i32} + %lock_2_2_14 = aie.lock(%tile_2_2, 4) {init = 0 : i32} + %lock_2_2_15 = aie.lock(%tile_2_2, 3) {init = 1 : i32} + %lock_2_2_16 = aie.lock(%tile_2_2, 2) {init = 0 : i32} + %lock_2_2_17 = aie.lock(%tile_2_2, 1) {init = 1 : i32} + %lock_2_2_18 = aie.lock(%tile_2_2, 0) {init = 0 : i32} + %lock_3_2 = aie.lock(%tile_3_2, 5) {init = 1 : i32} + %lock_3_2_19 = aie.lock(%tile_3_2, 4) {init = 0 : i32} + %lock_3_2_20 = aie.lock(%tile_3_2, 3) {init = 1 : i32} + %lock_3_2_21 = aie.lock(%tile_3_2, 2) {init = 0 : i32} + %lock_3_2_22 = aie.lock(%tile_3_2, 1) {init = 1 : i32} + %lock_3_2_23 = aie.lock(%tile_3_2, 0) {init = 0 : i32} + %buf14 = aie.buffer(%tile_0_1) {mem_bank = 0 : i32, sym_name = "buf14"} : memref<16x16xi32, 1 : i32> + %buf13 = aie.buffer(%tile_1_1) {mem_bank = 0 : i32, sym_name = "buf13"} : memref<16x16xi32, 1 : i32> + %buf12 = aie.buffer(%tile_2_1) {mem_bank = 0 : i32, sym_name = "buf12"} : memref<16x16xi32, 1 : i32> + %buf11 = aie.buffer(%tile_0_2) {mem_bank = 0 : i32, sym_name = "buf11"} : memref<2x2x4x8xi32, 2 : i32> + %buf10 = aie.buffer(%tile_0_2) {mem_bank = 0 : i32, sym_name = "buf10"} : memref<2x2x8x4xi32, 2 : i32> + %buf9 = aie.buffer(%tile_0_2) {mem_bank = 0 : i32, sym_name = "buf9"} : memref<2x2x4x4xi32, 2 : i32> + %buf8 = aie.buffer(%tile_1_2) {mem_bank = 0 : i32, sym_name = "buf8"} : memref<2x2x4x8xi32, 2 : i32> + %buf7 = aie.buffer(%tile_1_2) {mem_bank = 0 : i32, sym_name = "buf7"} : memref<2x2x8x4xi32, 2 : i32> + %buf6 = aie.buffer(%tile_1_2) {mem_bank = 0 : i32, sym_name = "buf6"} : memref<2x2x4x4xi32, 2 : i32> + %buf5 = aie.buffer(%tile_2_2) {mem_bank = 0 : i32, sym_name = "buf5"} : memref<2x2x4x8xi32, 2 : i32> + %buf4 = aie.buffer(%tile_2_2) {mem_bank = 0 : i32, sym_name = "buf4"} : memref<2x2x8x4xi32, 2 : i32> + %buf3 = aie.buffer(%tile_2_2) {mem_bank = 0 : i32, sym_name = "buf3"} : memref<2x2x4x4xi32, 2 : i32> + %buf2 = aie.buffer(%tile_3_2) {mem_bank = 0 : i32, sym_name = "buf2"} : memref<2x2x4x8xi32, 2 : i32> + %buf1 = aie.buffer(%tile_3_2) {mem_bank = 0 : i32, sym_name = "buf1"} : 
memref<2x2x8x4xi32, 2 : i32> + %buf0 = aie.buffer(%tile_3_2) {mem_bank = 0 : i32, sym_name = "buf0"} : memref<2x2x4x4xi32, 2 : i32> + %mem_0_2 = aie.mem(%tile_0_2) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb5, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_2_5, AcquireGreaterEqual, 1) + aie.dma_bd(%buf11 : memref<2x2x4x8xi32, 2 : i32>, 0, 128) + aie.use_lock(%lock_0_2_6, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb5 + %1 = aie.dma_start(S2MM, 1, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_2, AcquireGreaterEqual, 1) + aie.dma_bd(%buf10 : memref<2x2x8x4xi32, 2 : i32>, 0, 128) + aie.use_lock(%lock_0_2_4, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb0 + %2 = aie.dma_start(MM2S, 0, ^bb6, ^bb3, repeat_count = 1) + ^bb6: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_0_2_8, AcquireGreaterEqual, 1) + aie.dma_bd(%buf9 : memref<2x2x4x4xi32, 2 : i32>, 0, 64, [, , ]) + aie.use_lock(%lock_0_2_7, Release, 1) + aie.next_bd ^bb6 + } + %core_0_2 = aie.core(%tile_0_2) { + %c0_i32 = arith.constant 0 : i32 + %c4 = arith.constant 4 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_2_7, AcquireGreaterEqual, 1) + aie.use_lock(%lock_0_2_6, AcquireGreaterEqual, 1) + aie.use_lock(%lock_0_2_4, AcquireGreaterEqual, 1) + // + func.call @event_0() : () -> () + // + scf.for %arg0 = %c0 to %c2 step %c1 { + scf.for %arg1 = %c0 to %c2 step %c1 { + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c4 step %c1 { + memref.store %c0_i32, %buf9[%arg0, %arg1, %arg2, %arg3] : memref<2x2x4x4xi32, 2 : i32> + } + } + } + } + // + func.call @event_1() : () -> () + // + func.call @matmul_scalar_2x2x2_4x8x4_i32_i32(%buf11, %buf10, %buf9) : (memref<2x2x4x8xi32, 2 : i32>, memref<2x2x8x4xi32, 2 : i32>, memref<2x2x4x4xi32, 2 : i32>) -> () + aie.use_lock(%lock_0_2_8, Release, 1) + aie.use_lock(%lock_0_2_5, Release, 1) + aie.use_lock(%lock_0_2, Release, 1) + // + func.call @flush_trace() : () -> () + // + cf.br ^bb1 + } {elf_file = "segment_0_core_0_2.elf", link_with = "mm.o"} + %mem_1_2 = aie.mem(%tile_1_2) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb5, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_1_2_10, AcquireGreaterEqual, 1) + aie.dma_bd(%buf8 : memref<2x2x4x8xi32, 2 : i32>, 0, 128) + aie.use_lock(%lock_1_2_11, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb5 + %1 = aie.dma_start(S2MM, 1, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_1_2, AcquireGreaterEqual, 1) + aie.dma_bd(%buf7 : memref<2x2x8x4xi32, 2 : i32>, 0, 128) + aie.use_lock(%lock_1_2_9, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb0 + %2 = aie.dma_start(MM2S, 0, ^bb6, ^bb3, repeat_count = 1) + ^bb6: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_1_2_13, AcquireGreaterEqual, 1) + aie.dma_bd(%buf6 : memref<2x2x4x4xi32, 2 : i32>, 0, 64, [, , ]) + aie.use_lock(%lock_1_2_12, Release, 1) + aie.next_bd ^bb6 + } + %core_1_2 = aie.core(%tile_1_2) { + %c0_i32 = arith.constant 0 : i32 + %c4 = arith.constant 4 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_1_2_12, AcquireGreaterEqual, 1) + aie.use_lock(%lock_1_2_11, AcquireGreaterEqual, 1) + aie.use_lock(%lock_1_2_9, 
AcquireGreaterEqual, 1) + // + func.call @event_0() : () -> () + // + scf.for %arg0 = %c0 to %c2 step %c1 { + scf.for %arg1 = %c0 to %c2 step %c1 { + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c4 step %c1 { + memref.store %c0_i32, %buf6[%arg0, %arg1, %arg2, %arg3] : memref<2x2x4x4xi32, 2 : i32> + } + } + } + } + // + func.call @event_1() : () -> () + // + func.call @matmul_scalar_2x2x2_4x8x4_i32_i32(%buf8, %buf7, %buf6) : (memref<2x2x4x8xi32, 2 : i32>, memref<2x2x8x4xi32, 2 : i32>, memref<2x2x4x4xi32, 2 : i32>) -> () + aie.use_lock(%lock_1_2_13, Release, 1) + aie.use_lock(%lock_1_2_10, Release, 1) + aie.use_lock(%lock_1_2, Release, 1) + // + func.call @flush_trace() : () -> () + // + cf.br ^bb1 + } {elf_file = "segment_0_core_1_2.elf", link_with = "mm.o"} + %mem_2_2 = aie.mem(%tile_2_2) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb5, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_2_2_15, AcquireGreaterEqual, 1) + aie.dma_bd(%buf5 : memref<2x2x4x8xi32, 2 : i32>, 0, 128) + aie.use_lock(%lock_2_2_16, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb5 + %1 = aie.dma_start(S2MM, 1, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_2_2, AcquireGreaterEqual, 1) + aie.dma_bd(%buf4 : memref<2x2x8x4xi32, 2 : i32>, 0, 128) + aie.use_lock(%lock_2_2_14, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb0 + %2 = aie.dma_start(MM2S, 0, ^bb6, ^bb3, repeat_count = 1) + ^bb6: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_2_2_18, AcquireGreaterEqual, 1) + aie.dma_bd(%buf3 : memref<2x2x4x4xi32, 2 : i32>, 0, 64, [, , ]) + aie.use_lock(%lock_2_2_17, Release, 1) + aie.next_bd ^bb6 + } + %core_2_2 = aie.core(%tile_2_2) { + %c0_i32 = arith.constant 0 : i32 + %c4 = arith.constant 4 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_2_2_17, AcquireGreaterEqual, 1) + aie.use_lock(%lock_2_2_16, AcquireGreaterEqual, 1) + aie.use_lock(%lock_2_2_14, AcquireGreaterEqual, 1) + // + func.call @event_0() : () -> () + // + scf.for %arg0 = %c0 to %c2 step %c1 { + scf.for %arg1 = %c0 to %c2 step %c1 { + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c4 step %c1 { + memref.store %c0_i32, %buf3[%arg0, %arg1, %arg2, %arg3] : memref<2x2x4x4xi32, 2 : i32> + } + } + } + } + // + func.call @event_1() : () -> () + // + func.call @matmul_scalar_2x2x2_4x8x4_i32_i32(%buf5, %buf4, %buf3) : (memref<2x2x4x8xi32, 2 : i32>, memref<2x2x8x4xi32, 2 : i32>, memref<2x2x4x4xi32, 2 : i32>) -> () + aie.use_lock(%lock_2_2_18, Release, 1) + aie.use_lock(%lock_2_2_15, Release, 1) + aie.use_lock(%lock_2_2, Release, 1) + // + func.call @flush_trace() : () -> () + // + cf.br ^bb1 + } {elf_file = "segment_0_core_2_2.elf", link_with = "mm.o"} + %mem_3_2 = aie.mem(%tile_3_2) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb5, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_3_2_20, AcquireGreaterEqual, 1) + aie.dma_bd(%buf2 : memref<2x2x4x8xi32, 2 : i32>, 0, 128) + aie.use_lock(%lock_3_2_21, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb5 + %1 = aie.dma_start(S2MM, 1, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_3_2, AcquireGreaterEqual, 1) + aie.dma_bd(%buf1 : memref<2x2x8x4xi32, 2 : i32>, 0, 128) + aie.use_lock(%lock_3_2_19, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb0 + %2 = 
aie.dma_start(MM2S, 0, ^bb6, ^bb3, repeat_count = 1) + ^bb6: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_3_2_23, AcquireGreaterEqual, 1) + aie.dma_bd(%buf0 : memref<2x2x4x4xi32, 2 : i32>, 0, 64, [, , ]) + aie.use_lock(%lock_3_2_22, Release, 1) + aie.next_bd ^bb6 + } + %core_3_2 = aie.core(%tile_3_2) { + %c0_i32 = arith.constant 0 : i32 + %c4 = arith.constant 4 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_3_2_22, AcquireGreaterEqual, 1) + aie.use_lock(%lock_3_2_21, AcquireGreaterEqual, 1) + aie.use_lock(%lock_3_2_19, AcquireGreaterEqual, 1) + // + func.call @event_0() : () -> () + // + scf.for %arg0 = %c0 to %c2 step %c1 { + scf.for %arg1 = %c0 to %c2 step %c1 { + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c4 step %c1 { + memref.store %c0_i32, %buf0[%arg0, %arg1, %arg2, %arg3] : memref<2x2x4x4xi32, 2 : i32> + } + } + } + } + // + func.call @event_1() : () -> () + // + func.call @matmul_scalar_2x2x2_4x8x4_i32_i32(%buf2, %buf1, %buf0) : (memref<2x2x4x8xi32, 2 : i32>, memref<2x2x8x4xi32, 2 : i32>, memref<2x2x4x4xi32, 2 : i32>) -> () + aie.use_lock(%lock_3_2_23, Release, 1) + aie.use_lock(%lock_3_2_20, Release, 1) + aie.use_lock(%lock_3_2, Release, 1) + // + func.call @flush_trace() : () -> () + // + cf.br ^bb1 + } {elf_file = "segment_0_core_3_2.elf", link_with = "mm.o"} + aie.flow(%tile_0_0, DMA : 0, %tile_0_1, DMA : 0) + aie.flow(%tile_0_0, DMA : 1, %tile_1_1, DMA : 0) + aie.flow(%tile_0_1, DMA : 0, %tile_0_2, DMA : 0) + aie.flow(%tile_0_1, DMA : 0, %tile_1_2, DMA : 0) + aie.flow(%tile_0_1, DMA : 1, %tile_2_2, DMA : 0) + aie.flow(%tile_0_1, DMA : 1, %tile_3_2, DMA : 0) + aie.flow(%tile_1_1, DMA : 0, %tile_0_2, DMA : 1) + aie.flow(%tile_1_1, DMA : 0, %tile_2_2, DMA : 1) + aie.flow(%tile_1_1, DMA : 1, %tile_1_2, DMA : 1) + aie.flow(%tile_1_1, DMA : 1, %tile_3_2, DMA : 1) + aie.flow(%tile_0_2, DMA : 0, %tile_2_1, DMA : 0) + aie.flow(%tile_1_2, DMA : 0, %tile_2_1, DMA : 1) + aie.flow(%tile_2_2, DMA : 0, %tile_2_1, DMA : 2) + aie.flow(%tile_3_2, DMA : 0, %tile_2_1, DMA : 3) + aie.flow(%tile_2_1, DMA : 0, %tile_0_0, DMA : 0) + // + aie.packet_flow(0) { + aie.packet_source<%tile_0_2, Trace : 0> + aie.packet_dest<%tile_0_0, DMA : 1> + } {keep_pkt_header = true} + aie.packet_flow(1) { + aie.packet_source<%tile_1_2, Trace : 0> + aie.packet_dest<%tile_1_0, DMA : 1> + } {keep_pkt_header = true} + aie.packet_flow(2) { + aie.packet_source<%tile_2_2, Trace : 0> + aie.packet_dest<%tile_2_0, DMA : 1> + } {keep_pkt_header = true} + aie.packet_flow(3) { + aie.packet_source<%tile_3_2, Trace : 0> + aie.packet_dest<%tile_3_0, DMA : 1> + } {keep_pkt_header = true} + aie.packet_flow(4) { + aie.packet_source<%tile_0_1, Trace : 0> + aie.packet_dest<%tile_0_0, DMA : 1> + } {keep_pkt_header = true} + aie.packet_flow(5) { + aie.packet_source<%tile_1_1, Trace : 0> + aie.packet_dest<%tile_1_0, DMA : 1> + } {keep_pkt_header = true} + aie.packet_flow(6) { + aie.packet_source<%tile_2_1, Trace : 0> + aie.packet_dest<%tile_2_0, DMA : 1> + } {keep_pkt_header = true} + // + %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb5, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_1, AcquireGreaterEqual, 2) + aie.dma_bd(%buf14 : memref<16x16xi32, 1 : i32>, 0, 256) + aie.use_lock(%lock_0_1_2, Release, 2) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb5 + %1 = 
aie.dma_start(MM2S, 0, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_1_2, AcquireGreaterEqual, 1) + aie.dma_bd(%buf14 : memref<16x16xi32, 1 : i32>, 0, 128, [, , ]) + aie.use_lock(%lock_0_1, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb0 + %2 = aie.dma_start(MM2S, 1, ^bb6, ^bb3, repeat_count = 1) + ^bb6: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_0_1_2, AcquireGreaterEqual, 1) + aie.dma_bd(%buf14 : memref<16x16xi32, 1 : i32>, 128, 128, [, , ]) + aie.use_lock(%lock_0_1, Release, 1) + aie.next_bd ^bb6 + + } + %memtile_dma_1_1 = aie.memtile_dma(%tile_1_1) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb5, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_1_1, AcquireGreaterEqual, 2) + aie.dma_bd(%buf13 : memref<16x16xi32, 1 : i32>, 0, 256) + aie.use_lock(%lock_1_1_1, Release, 2) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb5 + %1 = aie.dma_start(MM2S, 0, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_1_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf13 : memref<16x16xi32, 1 : i32>, 0, 128, [, , ]) + aie.use_lock(%lock_1_1, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb0 + %2 = aie.dma_start(MM2S, 1, ^bb6, ^bb3, repeat_count = 1) + ^bb6: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_1_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf13 : memref<16x16xi32, 1 : i32>, 8, 128, [, , ]) + aie.use_lock(%lock_1_1, Release, 1) + aie.next_bd ^bb6 + } + %memtile_dma_2_1 = aie.memtile_dma(%tile_2_1) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb9, repeat_count = 1) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_2_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf12 : memref<16x16xi32, 1 : i32>, 0, 64, [, ]) + aie.use_lock(%lock_2_1_0, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb5 + %1 = aie.dma_start(S2MM, 1, ^bb4, ^bb2, repeat_count = 1) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_2_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf12 : memref<16x16xi32, 1 : i32>, 8, 64, [, ]) + aie.use_lock(%lock_2_1_0, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb7 + %2 = aie.dma_start(S2MM, 2, ^bb6, ^bb3, repeat_count = 1) + ^bb6: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_2_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf12 : memref<16x16xi32, 1 : i32>, 128, 64, [, ]) + aie.use_lock(%lock_2_1_0, Release, 1) + aie.next_bd ^bb6 + ^bb7: // pred: ^bb9 + %3 = aie.dma_start(S2MM, 3, ^bb8, ^bb6, repeat_count = 1) + ^bb8: // 2 preds: ^bb7, ^bb8 + aie.use_lock(%lock_2_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf12 : memref<16x16xi32, 1 : i32>, 136, 64, [, ]) + aie.use_lock(%lock_2_1_0, Release, 1) + aie.next_bd ^bb8 + ^bb9: // pred: ^bb0 + %4 = aie.dma_start(MM2S, 0, ^bb10, ^bb7, repeat_count = 1) + ^bb10: // 2 preds: ^bb9, ^bb10 + aie.use_lock(%lock_2_1_0, AcquireGreaterEqual, 4) + aie.dma_bd(%buf12 : memref<16x16xi32, 1 : i32>, 0, 256) + aie.use_lock(%lock_2_1, Release, 4) + aie.next_bd ^bb10 + } + aie.shim_dma_allocation @airMemcpyId12(S2MM, 0, 0) + memref.global "public" @airMemcpyId12 : memref<16x16xi32, 1 : i32> + aie.shim_dma_allocation @airMemcpyId4(MM2S, 0, 0) + memref.global "public" @airMemcpyId4 : memref<16x16xi32, 1 : i32> + aie.shim_dma_allocation @airMemcpyId5(MM2S, 1, 0) + memref.global "public" @airMemcpyId5 : memref<16x16xi32, 1 : i32> + func.func @matmul_16x16_16xi32__dispatch_0_matmul_16x16x16_i32(%arg0: memref<16x16xi32>, %arg1: memref<16x16xi32>, %arg2: memref<16x16xi32>) { + // + aiex.npu.write32 {address = 212992 : ui32, column = 3 : i32, row = 2 : 
i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213200 : ui32, column = 3 : i32, row = 2 : i32, value = 7995392 : ui32} // [22:16] start event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213204 : ui32, column = 3 : i32, row = 2 : i32, value = 3 : ui32} // packet_type: 0(core), packet_id: 3 + aiex.npu.write32 {address = 213216 : ui32, column = 3 : i32, row = 2 : i32, value = 1260527873 : ui32} // events: 0x4B(port0 run) 22(event1) 21(event0) 01(true) + aiex.npu.write32 {address = 213220 : ui32, column = 3 : i32, row = 2 : i32, value = 757865039 : ui32} // events: 0x2D(lock release) 2C(lock acquire) 1A(lock stall) 4F(port1 run) + aiex.npu.write32 {address = 261888 : ui32, column = 3 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 + aiex.npu.write32 {address = 261892 : ui32, column = 3 : i32, row = 2 : i32, value = 0 : ui32} + aiex.npu.writebd_shimtile {bd_id = 15 : i32, buffer_length = 8192 : i32, buffer_offset = 25600 : i32, column = 3 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 3: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.write32 {address = 119308 : ui32, column = 3 : i32, row = 0 : i32, value = 15 : ui32} + + aiex.npu.write32 {address = 212992 : ui32, column = 2 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213200 : ui32, column = 2 : i32, row = 2 : i32, value = 7995392 : ui32} // [22:16] start event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213204 : ui32, column = 2 : i32, row = 2 : i32, value = 2 : ui32} // packet_type: 0(core), packet_id: 2 + aiex.npu.write32 {address = 213216 : ui32, column = 2 : i32, row = 2 : i32, value = 1260527873 : ui32} // events: 0x4B(port0 run) 22(event1) 21(event0) 01(true) + aiex.npu.write32 {address = 213220 : ui32, column = 2 : i32, row = 2 : i32, value = 757865039 : ui32} // events: 0x2D(lock release) 2C(lock acquire) 1A(lock stall) 4F(port1 run) + aiex.npu.write32 {address = 261888 : ui32, column = 2 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 + aiex.npu.write32 {address = 261892 : ui32, column = 2 : i32, row = 2 : i32, value = 0 : ui32} + aiex.npu.writebd_shimtile {bd_id = 14 : i32, buffer_length = 8192 : i32, buffer_offset = 17408 : i32, column = 2 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 2: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.write32 {address = 119308 : ui32, column = 2 : i32, row = 0 : i32, value = 14 : ui32} + + aiex.npu.write32 {address = 212992 : ui32, column = 1 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) + aiex.npu.write32 {address = 
213200 : ui32, column = 1 : i32, row = 2 : i32, value = 7995392 : ui32} // [22:16] start event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213204 : ui32, column = 1 : i32, row = 2 : i32, value = 1 : ui32} // packet_type: 0(core), packet_id: 1 + aiex.npu.write32 {address = 213216 : ui32, column = 1 : i32, row = 2 : i32, value = 1260527873 : ui32} // events: 0x4B(port0 run) 22(event1) 21(event0) 01(true) + aiex.npu.write32 {address = 213220 : ui32, column = 1 : i32, row = 2 : i32, value = 757865039 : ui32} // events: 0x2D(lock release) 2C(lock acquire) 1A(lock stall) 4F(port1 run) + aiex.npu.write32 {address = 261888 : ui32, column = 1 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 + aiex.npu.write32 {address = 261892 : ui32, column = 1 : i32, row = 2 : i32, value = 0 : ui32} + aiex.npu.writebd_shimtile {bd_id = 13 : i32, buffer_length = 8192 : i32, buffer_offset = 9216 : i32, column = 1 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 1: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.write32 {address = 119308 : ui32, column = 1 : i32, row = 0 : i32, value = 13 : ui32} + + aiex.npu.write32 {address = 212992 : ui32, column = 0 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213200 : ui32, column = 0 : i32, row = 2 : i32, value = 7995392 : ui32} // [22:16] start event: 122(BROADCAST_15) + aiex.npu.write32 {address = 213204 : ui32, column = 0 : i32, row = 2 : i32, value = 0 : ui32} // packet_type: 0(core), packet_id: 0 + aiex.npu.write32 {address = 213216 : ui32, column = 0 : i32, row = 2 : i32, value = 1260527873 : ui32} // events: 0x4B(port0 run) 22(event1) 21(event0) 01(true) + aiex.npu.write32 {address = 213220 : ui32, column = 0 : i32, row = 2 : i32, value = 757865039 : ui32} // events: 0x2D(lock release) 2C(lock acquire) 1A(lock stall) 4F(port1 run) + aiex.npu.write32 {address = 261888 : ui32, column = 0 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 + aiex.npu.write32 {address = 261892 : ui32, column = 0 : i32, row = 2 : i32, value = 0 : ui32} + aiex.npu.writebd_shimtile {bd_id = 12 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 12 : ui32} + + aiex.npu.write32 {address = 606208 : ui32, column = 2 : i32, row = 1 : i32, value = 40192 : ui32} // [15:8] reset event: 157(BROADCAST_15) + aiex.npu.write32 {address = 606416 : ui32, column = 2 : i32, row = 1 : i32, value = 10289152 : ui32} // [23:16] start event: 
157(BROADCAST_15) + aiex.npu.write32 {address = 606420 : ui32, column = 2 : i32, row = 1 : i32, value = 12294 : ui32} // [14:12] packet_type: 3(mem_tile), [4:0] packet_id: 6 + aiex.npu.write32 {address = 606432 : ui32, column = 2 : i32, row = 1 : i32, value = 336 : ui32} // events: 0x00 00 01(true) 50(port0 run) + aiex.npu.write32 {address = 606436 : ui32, column = 2 : i32, row = 1 : i32, value = 1415076960 : ui32} // events: 0x54(port1 run) 58(port2 run) 5C(port3 run) 60(port4 run) + aiex.npu.write32 {address = 724736 : ui32, column = 2 : i32, row = 1 : i32, value = 589439264 : ui32} // [29:24] port3 S2MM-3, [21:16] port2 S2MM-2, [13:8] port1 S2MM-1, [5:0] port0 S2MM-0 + aiex.npu.write32 {address = 724740: ui32, column = 2 : i32, row = 1 : i32, value = 0 : ui32} // [5:0] port4 MM2S-0 + aiex.npu.writebd_shimtile {bd_id = 11 : i32, buffer_length = 8192 : i32, buffer_offset = 17408 : i32, column = 2 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 6: i32, packet_type = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.write32 {address = 119308 : ui32, column = 2 : i32, row = 0 : i32, value = 11 : ui32} + + aiex.npu.write32 {address = 606208 : ui32, column = 1 : i32, row = 1 : i32, value = 40192 : ui32} // [15:8] reset event: 157(BROADCAST_15) + aiex.npu.write32 {address = 606416 : ui32, column = 1 : i32, row = 1 : i32, value = 10289152 : ui32} // [23:16] start event: 157(BROADCAST_15) + aiex.npu.write32 {address = 606420 : ui32, column = 1 : i32, row = 1 : i32, value = 12293 : ui32} // [14:12] packet_type: 3(mem_tile), [4:0] packet_id: 5 + aiex.npu.write32 {address = 606432 : ui32, column = 1 : i32, row = 1 : i32, value = 336 : ui32} // events: 0x00 00 01(true) 50(port0 run) + aiex.npu.write32 {address = 606436 : ui32, column = 1 : i32, row = 1 : i32, value = 1415076960 : ui32} // events: 0x54(port1 run) 58(port2 run) 5C(port3 run) 60(port4 run) + aiex.npu.write32 {address = 724736 : ui32, column = 1 : i32, row = 1 : i32, value = 33620000 : ui32} // [29:24] port3 MM2S-2, [21:16] port2 MM2S-1, [13:8] port1 MM2S-0, [5:0] port0 S2MM-0 + aiex.npu.write32 {address = 724740: ui32, column = 1 : i32, row = 1 : i32, value = 3 : ui32} // [5:0] port4 MM2S-3 + aiex.npu.writebd_shimtile {bd_id = 10 : i32, buffer_length = 8192 : i32, buffer_offset = 9216 : i32, column = 1 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 5: i32, packet_type = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.write32 {address = 119308 : ui32, column = 1 : i32, row = 0 : i32, value = 10 : ui32} + + aiex.npu.write32 {address = 606208 : ui32, column = 0 : i32, row = 1 : i32, value = 40192 : ui32} // [15:8] reset event: 157(BROADCAST_15) + aiex.npu.write32 {address = 606416 : ui32, column = 0 : i32, row = 1 : i32, value = 10289152 : ui32} // 
[23:16] start event: 157(BROADCAST_15) + aiex.npu.write32 {address = 606420 : ui32, column = 0 : i32, row = 1 : i32, value = 12292 : ui32} // [14:12] packet_type: 3(mem_tile), [4:0] packet_id: 4 + aiex.npu.write32 {address = 606432 : ui32, column = 0 : i32, row = 1 : i32, value = 760239192 : ui32} // events: 0x2D(lock release) 50(port0 run) 0x54(port1 run) 58(port2 run) + aiex.npu.write32 {address = 606436 : ui32, column = 0 : i32, row = 1 : i32, value = 1549821032 : ui32} // events: 5C(port3 run) 60(port4 run) 64(port5 run) 68(port6 run) + aiex.npu.write32 {address = 724736 : ui32, column = 0 : i32, row = 1 : i32, value = 33620000 : ui32} // [29:24] port3 MM2S-2, [21:16] port2 MM2S-1, [13:8] port1 MM2S-0, [5:0] port0 S2MM-0 + aiex.npu.write32 {address = 724740: ui32, column = 0 : i32, row = 1 : i32, value = 270595 : ui32} // [21:16] port6 MM2S-4, [13:8] port5 S2MM-1, [5:0] port4 MM2S-3 + aiex.npu.writebd_shimtile {bd_id = 9 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 4: i32, packet_type = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 9 : ui32} + + aiex.npu.write32 {address = 212992: ui32, column = 0 : i32, row = 0 : i32, value = 32512 : ui32} // [14:8] reset event: 127(USER_EVENT_1) + aiex.npu.write32 {address = 213068: ui32, column = 0 : i32, row = 0 : i32, value = 127 : ui32} // [6:0] broadcast 15: 127(USER_EVENT_1) + aiex.npu.write32 {address = 213000: ui32, column = 0 : i32, row = 0 : i32, value = 127 : ui32} // event generate [6:0]: 127(USER_EVENT_1) + + // + memref.assume_alignment %arg0, 64 : memref<16x16xi32> + memref.assume_alignment %arg1, 64 : memref<16x16xi32> + memref.assume_alignment %arg2, 64 : memref<16x16xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<16x16xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<16x16xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 2 : i64, metadata = @airMemcpyId12} : memref<16x16xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} + return + } + } {sym_name = "segment_0"} +} diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/matrix_multiplication.h b/test/npu-xrt/matrix_multiplication_using_cascade/matrix_multiplication.h new file mode 100644 index 0000000000..9dbe9bd203 --- /dev/null +++ b/test/npu-xrt/matrix_multiplication_using_cascade/matrix_multiplication.h @@ -0,0 +1,316 @@ +//===- matrix_multiplication.h ----------------------------000---*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2024, Advanced Micro Devices, Inc. 
+// +//===----------------------------------------------------------------------===// + +// This file contains common helper functions for the matrix multiplication +// host code, such as verifying and printing matrices. + +#ifndef MATRIX_MULTIPLICATION_H +#define MATRIX_MULTIPLICATION_H + +#include +#include + +namespace matmul_common { + +namespace po = boost::program_options; + +// -------------------------------------------------------------------------- +// Command Line Argument Handling +// -------------------------------------------------------------------------- + +void check_arg_file_exists(po::variables_map &vm_in, std::string name) { + if (!vm_in.count(name)) { + throw std::runtime_error("Error: no " + name + " file was provided\n"); + } else { + std::ifstream test(vm_in[name].as()); + if (!test) { + throw std::runtime_error("The " + name + " file " + + vm_in[name].as() + + " does not exist.\n"); + } + } +} + +void add_default_options(po::options_description &desc) { + desc.add_options()("help,h", "produce help message")( + "xclbin,x", po::value()->required(), + "the input xclbin path")( + "kernel,k", po::value()->required(), + "the kernel name in the XCLBIN (for instance PP_PRE_FD)")( + "verbosity,v", po::value()->default_value(0), + "the verbosity of the output")( + "instr,i", po::value()->required(), + "path of file containing userspace instructions sent to the NPU")( + "verify", po::value()->default_value(true), + "whether to verify the AIE computed output")( + "iters", po::value()->default_value(1))( + "warmup", po::value()->default_value(0))( + "trace_sz,t", po::value()->default_value(0))( + "trace_file", po::value()->default_value("trace.txt"), + "where to store trace output"); +} + +void parse_options(int argc, const char *argv[], po::options_description &desc, + po::variables_map &vm) { + try { + po::store(po::parse_command_line(argc, argv, desc), vm); + po::notify(vm); + + if (vm.count("help")) { + std::cout << desc << "\n"; + std::exit(1); + } + } catch (const std::exception &ex) { + std::cerr << ex.what() << "\n\n"; + std::cerr << "Usage:\n" << desc << "\n"; + std::exit(1); + } + + check_arg_file_exists(vm, "xclbin"); + check_arg_file_exists(vm, "instr"); +} + +// -------------------------------------------------------------------------- +// AIE Specifics +// -------------------------------------------------------------------------- + +std::vector load_instr_sequence(std::string instr_path) { + std::ifstream instr_file(instr_path); + std::string line; + std::vector instr_v; + while (std::getline(instr_file, line)) { + std::istringstream iss(line); + uint32_t a; + if (!(iss >> std::hex >> a)) { + throw std::runtime_error("Unable to parse instruction file\n"); + } + instr_v.push_back(a); + } + return instr_v; +} + +// -------------------------------------------------------------------------- +// Matrix / Float / Math +// -------------------------------------------------------------------------- + +static inline std::int16_t random_int16_t() { + return (std::int16_t)rand() % 0x10000; +} + +static inline std::int32_t random_int32_t() { + return (std::int32_t)rand() % 0x10000; +} + +static inline std::bfloat16_t random_bfloat16_t() { + // Random numbers should NOT be uniformly between 0 and 1, because that + // would make the matrix product AB always close to 1. 
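+  // Scaling rand()/RAND_MAX by 4.0 spreads the entries over [0, 4), so
+  // element-wise products fall in [0, 16) and the accumulated dot products
+  // vary enough to be meaningful while staying well within range for the
+  // small matrices used in this test.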
+ return std::bfloat16_t(4.0 * (float)rand() / (float)(RAND_MAX)); +} + +template +void matmul_naive(int M, int N, int K, const std::vector A, + const std::vector B, std::vector &C) { + for (int row = 0; row < M; row++) { + for (int col = 0; col < N; col++) { + Tout running_sum = 0; + for (int k = 0; k < K; k++) { + running_sum += Tout(A[row * K + k] * B[k * N + col]); + } + C[row * N + col] = Tout(running_sum); + } + } +} + +const int K_block_size = 64; + +template +void matmul(int M, int N, int K, const std::vector A, + const std::vector B, std::vector &C) { + // A is an MxK matrix + // B is a KxN matrix + // C is the MxN output matrix, assumed to be zeroed out + + const int n_K_blocks = K / K_block_size; + assert(K % K_block_size == 0 && "K must be divisible by K_block_size"); + + const Tin *B_origin = B.data(); /* Avoid a calls to B.data() within the loop + with this const variable. B does not get + resized, so the pointer remains valid. */ + + const Tin *A_base = A.data(); /* Points to start of current row of A, + monotonically increasing by K. */ + const Tin *B_base = B_origin; /* Points to start of current column of B; + increases by 1 in each inner loop, resets + to B_origin (0) at the start of a new row + (outer loop). */ + + const Tin *A_ptr = A_base; + const Tin *B_ptr = B_base; + Tout *C_ptr = C.data(); /* Monotonically increasing by 1. */ + + for (int row = 0; row < M; row++) { + for (int col = 0; col < N; col++) { + A_ptr = A_base; + B_ptr = B_base; + Tout running_sum = 0; + for (int k = 0; k < n_K_blocks; k++) { + for (int i = 0; i < K_block_size; i++) { + running_sum += Tout(*A_ptr) * Tout(*B_ptr); + A_ptr += 1; // Advance to right neighbor; next value in this row + B_ptr += N; // Advance to bottom neighbor; next value in this column + } + } + *C_ptr = Tout(running_sum); + C_ptr += 1; + B_base += 1; /* Next iteration: same row of A (A_base unchanged), + next column of B (B_base increases by 1) */ + } + A_base += K; // Advance to next row of A + B_base = B_origin; /* Next row of A means we need to restart at the first + column of B. */ + } +} + +// nearly_equal function adapted from Stack Overflow, License CC BY-SA 4.0 +// Original author: P-Gn +// Source: https://stackoverflow.com/a/32334103 +bool nearly_equal(float a, float b, float epsilon = 128 * FLT_EPSILON, + float abs_th = FLT_MIN) +// those defaults are arbitrary and could be removed +{ + assert(std::numeric_limits::epsilon() <= epsilon); + assert(epsilon < 1.f); + + if (a == b) + return true; + + auto diff = std::abs(a - b); + auto norm = + std::min((std::abs(a) + std::abs(b)), std::numeric_limits::max()); + // or even faster: std::min(std::abs(a + b), + // std::numeric_limits::max()); keeping this commented out until I + // update figures below + return diff < std::max(abs_th, epsilon * norm); +} + +template +void print_matrix(const std::vector matrix, int n_cols, + int n_printable_rows = 10, int n_printable_cols = 10, + std::ostream &ostream = std::cout, + const char col_sep[] = " ", const char elide_sym[] = " ... 
", + int w = -1) { + assert(matrix.size() % n_cols == 0); + + auto maxima = std::minmax_element(matrix.begin(), matrix.end()); + T max_val = std::max(*maxima.first, std::abs(*maxima.second)); + size_t n_digits = log10(max_val); + if (w == -1) { + w = n_digits; + } + int n_rows = matrix.size() / n_cols; + + n_printable_rows = std::min(n_rows, n_printable_rows); + n_printable_cols = std::min(n_cols, n_printable_cols); + + const bool elide_rows = n_printable_rows < n_rows; + const bool elide_cols = n_printable_cols < n_cols; + + if (elide_rows || elide_cols) { + w = std::max((int)w, (int)strlen(elide_sym)); + } + + w += 3; // for decimal point and two decimal digits + ostream << std::fixed << std::setprecision(2); + +#define print_row(what) \ + for (int col = 0; col < n_printable_cols / 2; col++) { \ + ostream << std::right << std::setw(w) << (what); \ + ostream << std::setw(0) << col_sep; \ + } \ + if (elide_cols) { \ + ostream << std::setw(0) << elide_sym; \ + } \ + for (int col = n_printable_cols / 2 + 1; col < n_printable_cols; col++) { \ + ostream << std::right << std::setw(w) << (what); \ + ostream << std::setw(0) << col_sep; \ + } + + for (int row = 0; row < n_printable_rows / 2; row++) { + print_row(matrix[row * n_rows + col]); + ostream << std::endl; + } + if (elide_rows) { + print_row(elide_sym); + ostream << std::endl; + } + for (int row = n_printable_rows / 2 + 1; row < n_printable_rows; row++) { + print_row(matrix[row * n_rows + col]); + ostream << std::endl; + } + +#undef print_row +} + +template +int verify(int M, int N, int K, std::vector A, std::vector B, + std::vector C) { + int errors = 0; + int max_printable_errors = 500; + const float absTol = 0.5; + const float relTol = 0.5; + + std::vector CRef(M * N); + if (K % K_block_size == 0) { + matmul(M, N, K, A, B, CRef); + } else { + matmul_naive(M, N, K, A, B, CRef); + } + + for (int row = 0; row < M; row++) { + for (int col = 0; col < N; col++) { + if (!nearly_equal(CRef[row * N + col], C[row * N + col], relTol, + absTol)) { + errors++; + if (errors < max_printable_errors) { + std::cout << "Error in row " << row << ", col " << col << ". " + << "Expected " << std::setw(4) << (float)CRef[row * N + col] + << ", got " << std::setw(4) << (float)C[row * N + col] + << "." << std::endl; + } + } + } + } + + if (errors >= max_printable_errors) { + std::cout << "...and " << std::setw(0) << errors << " further errors." + << std::endl; + } + if (errors > 0) { + std::cout << std::endl << "Reference:" << std::endl; + matmul_common::print_matrix(CRef, N); + std::cout << std::endl << "Output:" << std::endl; + matmul_common::print_matrix(C, N); + } + + return errors; +} + +void write_out_trace(char *traceOutPtr, size_t trace_size, std::string path) { + std::ofstream fout(path); + uint32_t *traceOut = (uint32_t *)traceOutPtr; + for (int i = 0; i < trace_size / sizeof(traceOut[0]); i++) { + fout << std::setfill('0') << std::setw(8) << std::hex << (int)traceOut[i]; + fout << std::endl; + } +} + +} // namespace matmul_common + +#endif diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/mm.cc b/test/npu-xrt/matrix_multiplication_using_cascade/mm.cc new file mode 100644 index 0000000000..8a19bcc250 --- /dev/null +++ b/test/npu-xrt/matrix_multiplication_using_cascade/mm.cc @@ -0,0 +1,89 @@ +//===- mm.cc ----------------------------------------------000---*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#define __AIENGINE__ 2
+#define NOCPP
+#define __AIEARCH__ 20
+
+#include 
+#include 
+#include 
+
+template 
+void matmul_scalar_cascade_i32_i32(int32_t *a, int32_t *b, int32_t *c) {
+  for (int m_t = 0; m_t < M_tile; m_t++) {
+    for (int n_t = 0; n_t < N_tile; n_t++) {
+      for (int k_t = 0; k_t < K_tile; k_t++) {
+        event0();
+        int a_offset = (k_t * M_tile + m_t) * (M * K);
+        int b_offset = (n_t * K_tile + k_t) * (K * N);
+        int c_offset = (n_t * M_tile + m_t) * (M * N);
+        for (int m = 0; m < M; m++) {
+          for (int n = 0; n < N; n++) {
+            int32_t running_sum = 0;
+            if (get && k_t == 0) {
+              v32int32 v32 = get_scd_v32int32();
+              running_sum += ext_elem(v32, 0);
+            }
+            for (int k = 0; k < K; k++) {
+              running_sum += a[a_offset + m * K + k] * b[b_offset + k * N + n];
+            }
+            c[c_offset + m * N + n] += running_sum;
+            if (put && k_t == K_tile - 1) {
+              v32int32 v32 = undef_v32int32();
+              v32 = upd_elem(v32, 0, c[c_offset + m * N + n]);
+              put_mcd(v32);
+            }
+          }
+        }
+        event1();
+      }
+    }
+  }
+}
+
+extern "C" {
+
+void matmul_scalar_put_4x1x4_4x8x4_i32_i32(int32_t *a, int32_t *b, int32_t *c) {
+  matmul_scalar_cascade_i32_i32(a, b, c);
+}
+void matmul_scalar_get_4x1x4_4x8x4_i32_i32(int32_t *a, int32_t *b, int32_t *c) {
+  matmul_scalar_cascade_i32_i32(a, b, c);
+}
+void matmul_scalar_put_4x1x4_4x4x4_i32_i32(int32_t *a, int32_t *b, int32_t *c) {
+  matmul_scalar_cascade_i32_i32(a, b, c);
+}
+void matmul_scalar_get_4x1x4_4x4x4_i32_i32(int32_t *a, int32_t *b, int32_t *c) {
+  matmul_scalar_cascade_i32_i32(a, b, c);
+}
+void matmul_scalar_put_get_4x1x4_4x4x4_i32_i32(int32_t *a, int32_t *b,
+                                               int32_t *c) {
+  matmul_scalar_cascade_i32_i32(a, b, c);
+}
+void matmul_scalar_4x2x4_4x8x4_i32_i32(int32_t *a, int32_t *b, int32_t *c) {
+  matmul_scalar_cascade_i32_i32(a, b, c);
+}
+void matmul_scalar_2x2x2_4x8x4_i32_i32(int32_t *a, int32_t *b, int32_t *c) {
+  matmul_scalar_cascade_i32_i32(a, b, c);
+}
+
+void flush_trace() {
+  // event buffers only appear to be transferred to DDR in bursts of 256 bytes
+  // (64 events)
+  for (int i = 0; i < 32; i++) {
+    event0();
+    event1();
+  }
+}
+
+void event_0() { event0(); }
+void event_1() { event1(); }
+}
diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/run.lit b/test/npu-xrt/matrix_multiplication_using_cascade/run.lit
new file mode 100644
index 0000000000..03d7c862c5
--- /dev/null
+++ b/test/npu-xrt/matrix_multiplication_using_cascade/run.lit
@@ -0,0 +1,19 @@
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai, chess
+//
+// RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/mm.cc -o ./mm.o
+// RUN: g++-13 %S/test.cpp -o test.exe -std=c++23 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
+//
+// RUN: %python aiecc.py --xchesscc --xbridge --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie2_plain.xclbin --npu-insts-name=insts2_plain.txt %S/aie_plainx4.mlir
+// RUN: %run_on_npu ./test.exe -x aie2_plain.xclbin -k MLIR_AIE -i insts2_plain.txt --trace_sz 32768 | FileCheck --check-prefix=CHECK_PLAIN %s
+// CHECK_PLAIN: PASS!
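+//
+// All three designs exercised by this test compute the same matmul and are
+// checked by the same host code; judging by their names and the kernels in
+// mm.cc, aie_plainx4.mlir avoids the cascade, aie_bufferx4.mlir exchanges
+// partial sums through buffers, and aie_cascadex4.mlir chains cores over the
+// accumulator cascade (put_mcd/get_scd).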
+// +// RUN: %python aiecc.py --xchesscc --xbridge --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie2_buffer.xclbin --npu-insts-name=insts2_buffer.txt %S/aie_bufferx4.mlir +// RUN: %run_on_npu ./test.exe -x aie2_buffer.xclbin -k MLIR_AIE -i insts2_buffer.txt --trace_sz 32768 | FileCheck --check-prefix=CHECK_BUFFER %s +// CHECK_BUFFER: PASS! +// +// RUN: %python aiecc.py --xchesscc --xbridge --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie2_cascade.xclbin --npu-insts-name=insts2_cascade.txt %S/aie_cascadex4.mlir +// RUN: %run_on_npu ./test.exe -x aie2_cascade.xclbin -k MLIR_AIE -i insts2_cascade.txt --trace_sz 32768 | FileCheck --check-prefix=CHECK_CASCADE %s +// CHECK_CASCADE: PASS! diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/test.cpp b/test/npu-xrt/matrix_multiplication_using_cascade/test.cpp new file mode 100644 index 0000000000..3b26e0623a --- /dev/null +++ b/test/npu-xrt/matrix_multiplication_using_cascade/test.cpp @@ -0,0 +1,231 @@ + +//===- test.cpp -------------------------------------------000---*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2024, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xrt/xrt_bo.h" +#include "xrt/xrt_device.h" +#include "xrt/xrt_kernel.h" + +#include "matrix_multiplication.h" + +constexpr int M = 16; +constexpr int K = 16; +constexpr int N = 16; + +constexpr int A_VOLUME = M * K; +constexpr int B_VOLUME = N * K; +constexpr int C_VOLUME = M * N; + +using A_DATATYPE = std::int32_t; +using B_DATATYPE = std::int32_t; +using C_DATATYPE = std::int32_t; + +constexpr int A_SIZE = (A_VOLUME * sizeof(A_DATATYPE)); +constexpr int B_SIZE = (B_VOLUME * sizeof(B_DATATYPE)); +constexpr int C_SIZE = (C_VOLUME * sizeof(C_DATATYPE)); + +constexpr bool VERIFY = true; + +namespace po = boost::program_options; + +int main(int argc, const char *argv[]) { + + // Program arguments parsing + po::options_description desc("Allowed options"); + po::variables_map vm; + matmul_common::add_default_options(desc); + matmul_common::parse_options(argc, argv, desc, vm); + int verbosity = vm["verbosity"].as(); + int trace_size = vm["trace_sz"].as(); + + srand(time(NULL)); + + std::vector instr_v = + matmul_common::load_instr_sequence(vm["instr"].as()); + if (verbosity >= 1) + std::cout << "Sequence instr count: " << instr_v.size() << "\n"; + + // Start the XRT test code + // Get a device handle + unsigned int device_index = 0; + auto device = xrt::device(device_index); + + // Load the xclbin + if (verbosity >= 1) + std::cout << "Loading xclbin: " << vm["xclbin"].as() << "\n"; + auto xclbin = xrt::xclbin(vm["xclbin"].as()); + + if (verbosity >= 1) + std::cout << "Kernel opcode: " << vm["kernel"].as() << "\n"; + std::string Node = vm["kernel"].as(); + + // Get the kernel from the xclbin + auto xkernels = xclbin.get_kernels(); + auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(), + [Node, verbosity](xrt::xclbin::kernel &k) { + auto name = k.get_name(); + if (verbosity >= 1) { + std::cout << "Name: " << name << std::endl; + } + return name.rfind(Node, 0) == 0; + }); + auto kernelName = xkernel.get_name(); + + if 
(verbosity >= 1) + std::cout << "Registering xclbin: " << vm["xclbin"].as() + << "\n"; + + device.register_xclbin(xclbin); + + // get a hardware context + if (verbosity >= 1) + std::cout << "Getting hardware context.\n"; + xrt::hw_context context(device, xclbin.get_uuid()); + + // get a kernel handle + if (verbosity >= 1) + std::cout << "Getting handle to kernel:" << kernelName << "\n"; + auto kernel = xrt::kernel(context, kernelName); + + auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + auto bo_a = + xrt::bo(device, A_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); + auto bo_b = + xrt::bo(device, B_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + auto bo_c = xrt::bo(device, C_SIZE + trace_size, XRT_BO_FLAGS_HOST_ONLY, + kernel.group_id(4)); + + if (verbosity >= 1) + std::cout << "Writing data into buffer objects.\n"; + + A_DATATYPE *bufA = bo_a.map(); + std::vector AVec(A_VOLUME); + for (int i = 0; i < A_VOLUME; i++) { + AVec[i] = matmul_common::random_int32_t(); + } + memcpy(bufA, AVec.data(), (AVec.size() * sizeof(A_DATATYPE))); + B_DATATYPE *bufB = bo_b.map(); + std::vector BVec(B_VOLUME); + for (int i = 0; i < B_VOLUME; i++) { + BVec[i] = matmul_common::random_int32_t(); + } + memcpy(bufB, BVec.data(), (BVec.size() * sizeof(B_DATATYPE))); + C_DATATYPE *bufC = bo_c.map(); + std::vector CVec(C_VOLUME); + // memcpy(bufC, CVec.data(), (CVec.size() * sizeof(C_DATATYPE))); + memset(bufC, 0, C_SIZE + trace_size); + + void *bufInstr = bo_instr.map(); + memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); + + bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_a.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_b.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_c.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + unsigned num_iter = 1; + float npu_time_total = 0; + float npu_time_min = 9999999; + float npu_time_max = 0; + + int errors = 0; + float macs = 2.0 * float(M) * float(K) * float(N); + + for (unsigned iter = 0; iter < num_iter; iter++) { + + if (verbosity >= 1) { + std::cout << "Running Kernel.\n"; + } + auto start = std::chrono::high_resolution_clock::now(); + auto run = kernel(bo_instr, instr_v.size(), bo_a, bo_b, bo_c); + run.wait(); + auto stop = std::chrono::high_resolution_clock::now(); + bo_c.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + memcpy(CVec.data(), bufC, (CVec.size() * sizeof(C_DATATYPE))); + // std::vector CVecRef(C_VOLUME); + if (VERIFY) { + if (verbosity >= 1) { + std::cout << "Verifying against reference matmul ..." << std::endl; + } + auto vstart = std::chrono::system_clock::now(); + // matmul_common::matmul_naive(M, N, K, AVec, BVec, CVecRef); + errors = matmul_common::verify(M, N, K, AVec, BVec, CVec); + auto vstop = std::chrono::system_clock::now(); + float vtime = + std::chrono::duration_cast(vstop - vstart) + .count(); + if (verbosity >= 1) { + std::cout << "Verify time: " << vtime << "secs." << std::endl; + } + } else { + if (verbosity >= 1) + std::cout << "WARNING: matmul results not verified." << std::endl; + } + + float npu_time = + std::chrono::duration_cast(stop - start) + .count(); + + if (trace_size > 0) { + matmul_common::write_out_trace(((char *)bufC) + C_SIZE, trace_size, + vm["trace_file"].as()); + } + + npu_time_total += npu_time; + npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min; + npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max; + } + + std::cout << std::endl + << "Avg NPU matmul time: " << npu_time_total / num_iter << "us." 
+                    << std::endl;
+  std::cout << "Avg NPU gflops: " << macs / (1000 * npu_time_total / num_iter)
+            << std::endl;
+
+  std::cout << std::endl
+            << "Min NPU matmul time: " << npu_time_min << "us." << std::endl;
+  std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_min) << std::endl;
+
+  std::cout << std::endl
+            << "Max NPU matmul time: " << npu_time_max << "us." << std::endl;
+  std::cout << "Max NPU gflops: " << macs / (1000 * npu_time_max) << std::endl;
+
+  if (VERIFY && !errors) {
+    std::cout << "\nPASS!\n\n";
+
+    // Open the CSV file in appending mode
+    std::ofstream outfile;
+    outfile.open("../results.csv", std::ios_base::app);
+    // Write M, N, K, and avg runtime to the CSV file
+    outfile << M << "," << N << "," << K << "," << npu_time_total / num_iter
+            << "," << macs / (1000 * npu_time_total / num_iter) << std::endl;
+    // Close the CSV file
+    outfile.close();
+
+    return 0;
+  } else {
+    std::cout << "\nError count: " << errors << "\n\n";
+    std::cout << "\nFailed.\n\n";
+    return 1;
+  }
+}
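For reference, the cascade dataflow that mm.cc drives with put_mcd()/get_scd_v32int32() can be modeled entirely on the host: each core adds its slice of the K dimension on top of the partial sums received from its upstream neighbour and forwards the result downstream, so only the last core in the chain holds the final C values. The sketch below is illustrative only; run_core, NUM_CORES, and the test values are invented for this example, and it models the arithmetic of the chain, not the AIE intrinsics, tiling, or DMA of the actual designs.

#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

// Illustrative sizes; the test above uses M = K = N = 16 on a chain of cores.
constexpr int M = 16, K = 16, N = 16;
constexpr int NUM_CORES = 4;
constexpr int K_SLICE = K / NUM_CORES;

// One "core": accumulate partial products for its K-slice on top of the
// partial sums received over the modeled cascade (cascade_in) and forward
// the result (cascade_out). The vectors stand in for the hardware cascade
// channel that get_scd/put_mcd drive in mm.cc.
void run_core(int core, const std::vector<int32_t> &a,
              const std::vector<int32_t> &b,
              const std::vector<int32_t> &cascade_in,
              std::vector<int32_t> &cascade_out) {
  const int k_begin = core * K_SLICE;
  for (int m = 0; m < M; m++)
    for (int n = 0; n < N; n++) {
      int32_t sum = cascade_in[m * N + n]; // like get_scd at k_t == 0
      for (int k = k_begin; k < k_begin + K_SLICE; k++)
        sum += a[m * K + k] * b[k * N + n];
      cascade_out[m * N + n] = sum; // like put_mcd at the last k_t
    }
}

int main() {
  std::vector<int32_t> a(M * K), b(K * N);
  for (int i = 0; i < M * K; i++)
    a[i] = (i * 7) % 13;
  for (int i = 0; i < K * N; i++)
    b[i] = (i * 5) % 11;

  // Chain the cores: the first sees zeros, each later core sees its
  // predecessor's partial sums, and the last one holds the full result.
  std::vector<int32_t> partial(M * N, 0), next(M * N, 0);
  for (int core = 0; core < NUM_CORES; core++) {
    run_core(core, a, b, partial, next);
    std::swap(partial, next);
  }

  // Check against a naive reference, mirroring matmul_common::verify.
  int errors = 0;
  for (int m = 0; m < M; m++)
    for (int n = 0; n < N; n++) {
      int32_t ref = 0;
      for (int k = 0; k < K; k++)
        ref += a[m * K + k] * b[k * N + n];
      if (ref != partial[m * N + n])
        errors++;
    }
  std::cout << (errors == 0 ? "PASS!" : "FAIL") << "\n";
  return errors == 0 ? 0 : 1;
}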