Skip to content

Commit

Permalink
Matmul cascade (#1465)
Browse files Browse the repository at this point in the history
  • Loading branch information
Yu-Zhewen authored and singagan committed Jun 5, 2024
1 parent 3709f8e commit 67b26ff
Show file tree
Hide file tree
Showing 12 changed files with 2,681 additions and 15 deletions.
34 changes: 22 additions & 12 deletions lib/Dialect/AIE/Transforms/AIECreatePacketFlows.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -157,10 +157,17 @@ void updateCoordinates(int &xCur, int &yCur, WireBundle move) {
// Build a packet-switched route from the source to the destination with the
// given ID. The route is recorded in the given map of switchboxes.
void buildPSRoute(
int xSrc, int ySrc, Port sourcePort, int xDest, int yDest, Port destPort,
int flowID,
TileOp srcTile, Port sourcePort, TileOp destTile, Port destPort, int flowID,
DenseMap<TileID, SmallVector<std::pair<Connect, int>, 8>> &switchboxes,
bool reverseOrder = false) {

int xSrc = srcTile.colIndex();
int ySrc = srcTile.rowIndex();
int xDest = destTile.colIndex();
int yDest = destTile.rowIndex();

const auto &targetModel = getTargetModel(srcTile);

int xCur = xSrc;
int yCur = ySrc;
WireBundle curBundle = {};
Expand Down Expand Up @@ -213,6 +220,13 @@ void buildPSRoute(
if (move == lastBundle)
continue;

// If the source port is a trace port, we need to validate the destination
if (xCur == xSrc && yCur == ySrc &&
sourcePort.bundle == WireBundle::Trace &&
!targetModel.isValidTraceMaster(xSrc, ySrc, move, curChannel)) {
continue;
}

updateCoordinates(xCur, yCur, move);

if (std::find(congestion.begin(), congestion.end(), TileID{xCur, yCur}) !=
Expand Down Expand Up @@ -320,22 +334,18 @@ struct AIERoutePacketFlowsPass
Region &r = pktflow.getPorts();
Block &b = r.front();
int flowID = pktflow.IDInt();
int xSrc = 0, ySrc = 0;
Port sourcePort;
Port sourcePort, destPort;
TileOp srcTile, destTile;

for (Operation &Op : b.getOperations()) {
if (auto pktSource = dyn_cast<PacketSourceOp>(Op)) {
auto srcTile = dyn_cast<TileOp>(pktSource.getTile().getDefiningOp());
xSrc = srcTile.colIndex();
ySrc = srcTile.rowIndex();
srcTile = dyn_cast<TileOp>(pktSource.getTile().getDefiningOp());
sourcePort = pktSource.port();
} else if (auto pktDest = dyn_cast<PacketDestOp>(Op)) {
auto destTile = dyn_cast<TileOp>(pktDest.getTile().getDefiningOp());
int xDest = destTile.colIndex();
int yDest = destTile.rowIndex();
Port destPort = pktDest.port();
destTile = dyn_cast<TileOp>(pktDest.getTile().getDefiningOp());
destPort = pktDest.port();

buildPSRoute(xSrc, ySrc, sourcePort, xDest, yDest, destPort, flowID,
buildPSRoute(srcTile, sourcePort, destTile, destPort, flowID,
switchboxes, true);

// Assign "keep_pkt_header flag"
Expand Down
64 changes: 61 additions & 3 deletions programming_examples/utils/parse_trace.py
Original file line number Diff line number Diff line change
Expand Up @@ -702,8 +702,42 @@ def parse_mlir_trace_events(lines):
pid_events[1][key][5] = (value >> 8) & 0xFF
pid_events[1][key][6] = (value >> 16) & 0xFF
pid_events[1][key][7] = (value >> 24) & 0xFF

# TODO intfc and memtile event 0, 1 needs to also be defined
# memtile event 0
elif address == 0x940E0: # 606432
if pid_events[3].get(key) == None:
pid_events[3][key] = [
0,
0,
0,
0,
0,
0,
0,
0,
] # TODO no better way to init this?
# print("Trace event 0 configured to be ",hex(value))
pid_events[3][key][0] = value & 0xFF
pid_events[3][key][1] = (value >> 8) & 0xFF
pid_events[3][key][2] = (value >> 16) & 0xFF
pid_events[3][key][3] = (value >> 24) & 0xFF
# memtile event 1
elif address == 0x940E4: # 606436
if pid_events[3].get(key) == None:
pid_events[3][key] = [
0,
0,
0,
0,
0,
0,
0,
0,
] # TODO no better way to init this?
pid_events[3][key][4] = value & 0xFF
pid_events[3][key][5] = (value >> 8) & 0xFF
pid_events[3][key][6] = (value >> 16) & 0xFF
pid_events[3][key][7] = (value >> 24) & 0xFF
# TODO intfc event 0, 1 needs to also be defined

# print("Found labels:\n")
# for j in pid_events:
Expand Down Expand Up @@ -750,7 +784,9 @@ def lookup_event_name_by_type(trace_type, code):
# Mem traces
elif trace_type == 1:
# TODO Need to define these
if code == 21: # x15
if code == 0x1:
event = "True"
elif code == 21: # x15
event = "DMA s2mm 0 start bd"
elif code == 22: # x16
event = "DMA s2mm 1 start bd"
Expand Down Expand Up @@ -780,6 +816,28 @@ def lookup_event_name_by_type(trace_type, code):
event = "DMA s2mm 1 stalled lock acquire"
else:
event = "Unknown"
# memtile traces
elif trace_type == 3:
if code == 0x1:
event = "True"
elif code == 80: # 0x50
event = "PortRunning0"
elif code == 84: # 0x54
event = "PortRunning1"
elif code == 88: # 0x58
event = "PortRunning2"
elif code == 92: # 0x5C
event = "PortRunning3"
elif code == 96: # 0x60
event = "PortRunning4"
elif code == 100: # 0x64
event = "PortRunning5"
elif code == 104: # 0x68
event = "PortRunning6"
elif code == 108: # 0x6C
event = "PortRunning7"
else:
event = "Unknown"
else:
event = "Unknown"
return event
Expand Down
28 changes: 28 additions & 0 deletions test/create-packet-flows/trace_packet_routing.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
//===- trace_packet_routing.mlir ------------------------------------------------*- MLIR -*-===//
//
// Copyright (C) 2024, Advanced Micro Devices, Inc.
// SPDX-License-Identifier: MIT
//
//===----------------------------------------------------------------------===//
// REQUIRES: ryzen_ai, chess

// RUN: aie-opt --aie-create-packet-flows %s | FileCheck %s
// CHECK-LABEL: module @trace_packet_routing {

// Test input for --aie-create-packet-flows: two packet flows whose sources are
// Trace ports and whose destinations are DMA ports on other tiles.
module @trace_packet_routing {
aie.device(npu1_4col) {
// Tiles referenced by the packet flows below.
%tile_0_0 = aie.tile(0, 0)
%tile_1_0 = aie.tile(1, 0)
%tile_0_2 = aie.tile(0, 2)
%tile_0_3 = aie.tile(0, 3)

// Packet flow 0: Trace : 0 on %tile_0_2 to DMA : 1 on %tile_0_0, with the
// keep_pkt_header attribute set.
aie.packet_flow(0) {
aie.packet_source<%tile_0_2, Trace : 0> // core trace
aie.packet_dest<%tile_0_0, DMA : 1>
} {keep_pkt_header = true}
// Packet flow 1: Trace : 0 on %tile_0_3 to DMA : 1 on %tile_1_0, with the
// keep_pkt_header attribute set.
aie.packet_flow(1) {
aie.packet_source<%tile_0_3, Trace : 0> // core trace
aie.packet_dest<%tile_1_0, DMA : 1>
} {keep_pkt_header = true}
}
}
38 changes: 38 additions & 0 deletions test/npu-xrt/matrix_multiplication_using_cascade/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
<!---//===- README.md --------------------------*- Markdown -*-===//
//
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// Copyright (C) 2024, Advanced Micro Devices, Inc.
//
//===----------------------------------------------------------------------===//-->

## MM Cascade Design Example
This is a matrix-multiplication example of size (16 * 16) * (16 * 16) with the i32 data type. Four versions are compared to examine the possibility of distributing the K dimension across multiple cores.

### Plainx1 Version<br>
Generated from the IREE end-to-end flow, using only one core.

### Plainx4 Version<br>
Uses four cores in an output-stationary configuration.

### Bufferx4 Version<br>
With four cores chained horizontally, the intermediate accumulations are passed through shared buffers implemented as ObjectFIFO.

### Cascadex4 Version<br>
Still having four cores but the intermediate accumulations are communicated through the cascade port.

### Results<br>
From the trace files,

| | Total | Init | Compute |
|-----------|--------|-------|---------|
| Plainx1 | 25.6us | 7.6us | 18.0us |
| Plainx4 | 6.7us | 2.0us | 4.7us |
| Bufferx4 | 32.0us | 7.6us | 24.4us |
| Cascadex4 | 13.9us | 7.6us | 6.3us |

The Buffer version is slow because of frequent lock-related operations.

The Cascade version almost halves the latency relative to Plainx1 (13.9us vs. 25.6us), but it uses 4x the cores. The performance gain is constrained by the initialization time of the accumulation buffer, which depends only on M x N.
Loading

0 comments on commit 67b26ff

Please sign in to comment.