Matmul cascade (#1465)

Xilinx · Jun 5, 2024 · 67b26ff · 67b26ff
1 parent 3709f8e
commit 67b26ff
Show file tree

Hide file tree

Showing 12 changed files with 2,681 additions and 15 deletions.
diff --git a/lib/Dialect/AIE/Transforms/AIECreatePacketFlows.cpp b/lib/Dialect/AIE/Transforms/AIECreatePacketFlows.cpp
@@ -157,10 +157,17 @@ void updateCoordinates(int &xCur, int &yCur, WireBundle move) {
 // Build a packet-switched route from the source to the destination with the
 // given ID. The route is recorded in the given map of switchboxes.
 void buildPSRoute(
-    int xSrc, int ySrc, Port sourcePort, int xDest, int yDest, Port destPort,
-    int flowID,
+    TileOp srcTile, Port sourcePort, TileOp destTile, Port destPort, int flowID,
     DenseMap<TileID, SmallVector<std::pair<Connect, int>, 8>> &switchboxes,
     bool reverseOrder = false) {
+
+  int xSrc = srcTile.colIndex();
+  int ySrc = srcTile.rowIndex();
+  int xDest = destTile.colIndex();
+  int yDest = destTile.rowIndex();
+
+  const auto &targetModel = getTargetModel(srcTile);
+
   int xCur = xSrc;
   int yCur = ySrc;
   WireBundle curBundle = {};
@@ -213,6 +220,13 @@ void buildPSRoute(
       if (move == lastBundle)
         continue;
 
+      // If the source port is a trace port, we need to validate the destination
+      if (xCur == xSrc && yCur == ySrc &&
+          sourcePort.bundle == WireBundle::Trace &&
+          !targetModel.isValidTraceMaster(xSrc, ySrc, move, curChannel)) {
+        continue;
+      }
+
       updateCoordinates(xCur, yCur, move);
 
       if (std::find(congestion.begin(), congestion.end(), TileID{xCur, yCur}) !=
@@ -320,22 +334,18 @@ struct AIERoutePacketFlowsPass
       Region &r = pktflow.getPorts();
       Block &b = r.front();
       int flowID = pktflow.IDInt();
-      int xSrc = 0, ySrc = 0;
-      Port sourcePort;
+      Port sourcePort, destPort;
+      TileOp srcTile, destTile;
 
       for (Operation &Op : b.getOperations()) {
         if (auto pktSource = dyn_cast<PacketSourceOp>(Op)) {
-          auto srcTile = dyn_cast<TileOp>(pktSource.getTile().getDefiningOp());
-          xSrc = srcTile.colIndex();
-          ySrc = srcTile.rowIndex();
+          srcTile = dyn_cast<TileOp>(pktSource.getTile().getDefiningOp());
           sourcePort = pktSource.port();
         } else if (auto pktDest = dyn_cast<PacketDestOp>(Op)) {
-          auto destTile = dyn_cast<TileOp>(pktDest.getTile().getDefiningOp());
-          int xDest = destTile.colIndex();
-          int yDest = destTile.rowIndex();
-          Port destPort = pktDest.port();
+          destTile = dyn_cast<TileOp>(pktDest.getTile().getDefiningOp());
+          destPort = pktDest.port();
 
-          buildPSRoute(xSrc, ySrc, sourcePort, xDest, yDest, destPort, flowID,
+          buildPSRoute(srcTile, sourcePort, destTile, destPort, flowID,
                        switchboxes, true);
 
           // Assign "keep_pkt_header flag"

diff --git a/programming_examples/utils/parse_trace.py b/programming_examples/utils/parse_trace.py
@@ -702,8 +702,42 @@ def parse_mlir_trace_events(lines):
                 pid_events[1][key][5] = (value >> 8) & 0xFF
                 pid_events[1][key][6] = (value >> 16) & 0xFF
                 pid_events[1][key][7] = (value >> 24) & 0xFF
-
-            # TODO intfc and memtile event 0, 1 needs to also be defined
+            # memtile event 0
+            elif address == 0x940E0:  # 606432
+                if pid_events[3].get(key) == None:
+                    pid_events[3][key] = [
+                        0,
+                        0,
+                        0,
+                        0,
+                        0,
+                        0,
+                        0,
+                        0,
+                    ]  # TODO no better way to init this?
+                # print("Trace event 0 configured to be ",hex(value))
+                pid_events[3][key][0] = value & 0xFF
+                pid_events[3][key][1] = (value >> 8) & 0xFF
+                pid_events[3][key][2] = (value >> 16) & 0xFF
+                pid_events[3][key][3] = (value >> 24) & 0xFF
+            # memtile event 1
+            elif address == 0x940E4:  # 606436
+                if pid_events[3].get(key) == None:
+                    pid_events[3][key] = [
+                        0,
+                        0,
+                        0,
+                        0,
+                        0,
+                        0,
+                        0,
+                        0,
+                    ]  # TODO no better way to init this?
+                pid_events[3][key][4] = value & 0xFF
+                pid_events[3][key][5] = (value >> 8) & 0xFF
+                pid_events[3][key][6] = (value >> 16) & 0xFF
+                pid_events[3][key][7] = (value >> 24) & 0xFF
+            # TODO intfc event 0, 1 needs to also be defined
 
     # print("Found labels:\n")
     # for j in pid_events:
@@ -750,7 +784,9 @@ def lookup_event_name_by_type(trace_type, code):
     # Mem traces
     elif trace_type == 1:
         # TODO Need to define these
-        if code == 21:  # x15
+        if code == 0x1:
+            event = "True"
+        elif code == 21:  # x15
             event = "DMA s2mm 0 start bd"
         elif code == 22:  # x16
             event = "DMA s2mm 1 start bd"
@@ -780,6 +816,28 @@ def lookup_event_name_by_type(trace_type, code):
             event = "DMA s2mm 1 stalled lock acquire"
         else:
             event = "Unknown"
+    # memtile traces
+    elif trace_type == 3:
+        if code == 0x1:
+            event = "True"
+        elif code == 80:  # 0x50
+            event = "PortRunning0"
+        elif code == 84:  # 0x54
+            event = "PortRunning1"
+        elif code == 88:  # 0x58
+            event = "PortRunning2"
+        elif code == 92:  # 0x5C
+            event = "PortRunning3"
+        elif code == 96:  # 0x60
+            event = "PortRunning4"
+        elif code == 100:  # 0x64
+            event = "PortRunning5"
+        elif code == 104:  # 0x68
+            event = "PortRunning6"
+        elif code == 108:  # 0x6C
+            event = "PortRunning7"
+        else:
+            event = "Unknown"
     else:
         event = "Unknown"
     return event

diff --git a/test/create-packet-flows/trace_packet_routing.mlir b/test/create-packet-flows/trace_packet_routing.mlir
@@ -0,0 +1,28 @@
+//===- trace_packet_routing.mlir ------------------------------------------------*- MLIR -*-===//
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: MIT
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: ryzen_ai, chess
+
+// RUN: aie-opt --aie-create-packet-flows %s | FileCheck %s
+// CHECK-LABEL: module @trace_packet_routing {
+
+module @trace_packet_routing {
+ aie.device(npu1_4col) {
+  %tile_0_0 = aie.tile(0, 0)
+  %tile_1_0 = aie.tile(1, 0)
+  %tile_0_2 = aie.tile(0, 2)
+  %tile_0_3 = aie.tile(0, 3)
+
+  aie.packet_flow(0) { 
+    aie.packet_source<%tile_0_2, Trace : 0> // core trace
+    aie.packet_dest<%tile_0_0, DMA : 1>
+  } {keep_pkt_header = true}
+  aie.packet_flow(1) { 
+    aie.packet_source<%tile_0_3, Trace : 0> // core trace
+    aie.packet_dest<%tile_1_0, DMA : 1>
+  } {keep_pkt_header = true}
+ }
+}
diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/README.md b/test/npu-xrt/matrix_multiplication_using_cascade/README.md
@@ -0,0 +1,38 @@
+<!---//===- README.md --------------------------*- Markdown -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+// 
+//===----------------------------------------------------------------------===//-->
+
+## MM Cascade Design Example
+This is a matrix multiply example with sizes (16 * 16) * (16 * 16) and the i32 data type, where four different versions are compared to examine the possibility of distributing the K dimension across multiple cores.
+
+### Plainx1 Version<br>
+Generated from IREE end-to-end flow, using one core only.
+
+### Plainx4 Version<br>
+Uses four cores in an output-stationary arrangement.
+
+### Bufferx4 Version<br>
+With four cores chained horizontally, the intermediate accumulations are passed through shared buffers implemented as ObjectFIFO.
+
+### Cascadex4 Version<br>
+Still having four cores but the intermediate accumulations are communicated through the cascade port.
+
+### Results<br>
+The following timings were measured from the trace files:
+
+|           | Total  | Init  | Compute |
+|-----------|--------|-------|---------|
+| Plainx1   | 25.6us | 7.6us | 18.0us  |
+| Plainx4   | 6.7us  | 2.0us | 4.7us   |
+| Bufferx4  | 32.0us | 7.6us | 24.4us  |
+| Cascadex4 | 13.9us | 7.6us | 6.3us   |
+
+The Buffer version is slow because of frequent lock-related operations.
+
+The Cascade version almost halves the latency relative to Plainx1 (13.9us vs. 25.6us), but uses four times as many cores. The performance gain is constrained by the initialization time of the accumulation buffer, which depends only on M x N.