Skip to content

Commit

Permalink
tracing experiment
Browse files Browse the repository at this point in the history
  • Loading branch information
stephenneuendorffer committed Apr 10, 2024
1 parent a9a71b4 commit fca2795
Show file tree
Hide file tree
Showing 2 changed files with 134 additions and 79 deletions.
211 changes: 133 additions & 78 deletions programming_examples/basic/vector_scalar/aie2.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,130 @@
from aie.dialects.scf import *
from aie.extras.context import mlir_mod_ctx

def pack4bytes(b3, b2, b1, b0):
    """Pack four byte-sized values into one 32-bit word.

    Each argument is masked to its low 8 bits before being placed, so
    out-of-range values cannot spill into a neighbouring byte lane.

    Args:
        b3: Value for bits 31..24 (most significant byte).
        b2: Value for bits 23..16.
        b1: Value for bits 15..8.
        b0: Value for bits 7..0 (least significant byte).

    Returns:
        The packed integer ``0xB3B2B1B0``.
    """
    w = (b3 & 0xFF) << 24
    w |= (b2 & 0xFF) << 16
    w |= (b1 & 0xFF) << 8
    w |= b0 & 0xFF
    return w

# Configure tracing, see https://github.com/Xilinx/mlir-aie/blob/resnet/docs/Tracing.md
# This is a very simple model of tracing, which has some big assumptions:
# 1) Trace data is collected over circuit switched connections, not packet-switched
# 2) A ShimDMA S2MM channel is dedicated to the trace data
# 3) Trace data is small enough to fit in a fixed-size buffer, which is collected with the
# outputs of the design
# 4) The usual model of '2 inputs, 1 output' is followed, and the
# trace data is appended to the other outputs

# tile: The tile we're tracing
# shim: The shim tile to output data with.
# bd_id: The BD in the shim tile to use.
# channel: The S2MM channel to use (0 or 1).
# size: The size of the trace data
# offset: The offset of the trace data in the (single) output buffer.
# start: The event number to start tracing on
# stop: The event number to stop tracing on
# events: A list of events to trace. Up to 8 events are allowed in aie2, more are ignored

# Event numbers should be less than 128.
# Big assumption: The bd_id and channel are unused. If they are used by something else, then
# everything will probably break.
def configure_simple_tracing_aie2(
    tile, shim, bd_id, channel, size, offset, start, stop, events
):
    """Emit the writes that set up simple circuit-switched tracing of a tile.

    Programs the trace control and trace event registers of ``tile``, selects
    the stream switch event ports, then sets up a shim buffer descriptor and
    an S2MM channel on ``shim`` to carry the trace data out.

    Args:
        tile: The tile being traced.
        shim: The shim tile used to output the trace data.
        bd_id: The buffer descriptor in the shim tile to use.
        channel: The S2MM channel to use (0 or 1).
        size: The size of the trace data.
        offset: The offset of the trace data in the (single) output buffer.
        start: The event number that starts tracing.
        stop: The event number that stops tracing.
        events: Ordered sequence of event numbers to trace. It is padded with
            zeros to exactly 8 entries; entries beyond the eighth are ignored.
            The first entry lands in the most significant byte of Trace Event
            Group 1.

    Raises:
        AssertionError: If ``shim`` is not a shim tile.
        TypeError: If ``events`` is a set, which has no defined slot order.
    """
    assert shim.isShimTile()

    # Event slots are positional, so an unordered collection cannot be mapped
    # onto them meaningfully; fail loudly instead of tracing the wrong slots.
    if isinstance(events, (set, frozenset)):
        raise TypeError(
            "events must be an ordered sequence (list or tuple), not a set"
        )

    # Pad the input so we have exactly 8 events.
    events = (list(events) + [0] * 8)[:8]

    # 0x340D0: Trace Control 0
    #          0xAABB---C
    #            AA        <- Event to stop trace capture
    #              BB      <- Event to start trace capture
    #                   C  <- Trace mode, 00=event-time, 01=event-PC, 10=execution
    # The two low bytes are written as zero, i.e. trace mode 00.
    ipu_write32(
        column=tile.col(),
        row=tile.row(),
        address=0x340D0,
        value=pack4bytes(stop, start, 0, 0),
    )
    # 0x340D4: Trace Control 1
    # This is used to control packet routing.  For the moment
    # only deal with the simple case of circuit routing.
    ipu_write32(
        column=tile.col(),
        row=tile.row(),
        address=0x340D4,
        value=0,
    )
    # 0x340E0: Trace Event Group 1  (Which events to trace)
    #          0xAABBCCDD    AA, BB, CC, DD <- four event slots
    # The slice must yield exactly four values: events[0] .. events[3].
    ipu_write32(
        column=tile.col(),
        row=tile.row(),
        address=0x340E0,
        value=pack4bytes(*events[0:4]),
    )
    # 0x340E4: Trace Event Group 2  (Which events to trace)
    #          0xAABBCCDD    AA, BB, CC, DD <- four event slots
    # Likewise events[4] .. events[7].
    ipu_write32(
        column=tile.col(),
        row=tile.row(),
        address=0x340E4,
        value=pack4bytes(*events[4:8]),
    )

    # 0x3FF00: Stream switch event port selection 0
    def master(port):
        # Bit 5 marks the selected port as a master port.
        return port | (1 << 5)

    def slave(port):
        return port

    ipu_write32(
        column=tile.col(),
        row=tile.row(),
        address=0x3FF00,
        value=pack4bytes(0, 0, slave(1), master(1)),  # port 1 is FIFO0?
    )
    # 0x3FF04: presumably Stream switch event port selection 1 (TODO confirm
    # against the register reference); all four selections are cleared.
    ipu_write32(
        column=tile.col(),
        row=tile.row(),
        address=0x3FF04,
        value=pack4bytes(0, 0, 0, 0),
    )

    # Configure a buffer descriptor to write tracing information that has been
    # routed into this shim tile out to host DDR memory
    ipu_writebd_shimtile(
        bd_id=bd_id,
        buffer_length=size,
        buffer_offset=offset,
        enable_packet=0,
        out_of_order_id=0,
        packet_id=0,
        packet_type=0,
        column=shim.col(),
        column_num=1,
        d0_size=0,
        d0_stride=0,
        d1_size=0,
        d1_stride=0,
        d2_stride=0,
        # Assume using output buffer.  This probably needs to be configurable.
        ddr_id=2,
        iteration_current=0,
        iteration_size=0,
        iteration_stride=0,
        lock_acq_enable=0,
        lock_acq_id=0,
        lock_acq_val=0,
        lock_rel_id=0,
        lock_rel_val=0,
        next_bd=0,
        use_next_bd=0,
        valid_bd=1,
    )
    # Configure the S2MM channel: write the BD id to the register for the
    # requested channel (0x1D204 for channel 0, 0x1D20C otherwise).
    ipu_write32(
        column=shim.col(),
        row=shim.row(),
        address=0x1D204 if channel == 0 else 0x1D20C,
        value=bd_id,
    )

def my_vector_scalar():
N = 4096
Expand Down Expand Up @@ -77,85 +201,16 @@ def core_body():
@FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty)
def sequence(A, B, C):

# Configure tracing, see https://github.com/Xilinx/mlir-aie/blob/resnet/docs/Tracing.md
if enable_tracing:
# 0x340D0: Trace Control 0
# 0xAABB---C
# AA <- Event to stop trace capture
# BB <- Event to start trace capture
# C <- Trace mode, 00=event=time, 01=event-PC, 10=execution
# Configure so that "Event 1" (always true) causes tracing to start
ipu_write32(
column=compute_tile2_col,
row=compute_tile2_row,
address=0x340D0,
value=0x00010000,
)
# 0x340D4: Trace Control 1
ipu_write32(
column=compute_tile2_col,
row=compute_tile2_row,
address=0x340D4,
value=0x00000000,
)
# 0x340E0: Trace Event Group 1 (Which events to trace)
# 0xAABBCCDD AA, BB, CC, DD <- four event slots
ipu_write32(
column=compute_tile2_col,
row=compute_tile2_row,
address=0x340E0,
value=0x4B222125,
)
# 0x340E4: Trace Event Group 2 (Which events to trace)
# 0xAABBCCDD AA, BB, CC, DD <- four event slots
ipu_write32(
column=compute_tile2_col,
row=compute_tile2_row,
address=0x340E4,
value=0x2D2C1A4F,
)

ipu_write32(
column=compute_tile2_col,
row=compute_tile2_row,
address=0x3FF00,
value=0x00000121,
)

# Configure a buffer descriptor to write tracing information that has been routed into this shim tile
# out to host DDR memory
trace_bd_id = 13 # use BD 13 for writing trace output from compute tile to DDR host memory
output_size = N_in_bytes
ipu_writebd_shimtile(
bd_id=trace_bd_id,
buffer_length=trace_size,
buffer_offset=output_size,
enable_packet=0,
out_of_order_id=0,
packet_id=0,
packet_type=0,
column=0,
column_num=1,
d0_size=0,
d0_stride=0,
d1_size=0,
d1_stride=0,
d2_stride=0,
ddr_id=2,
iteration_current=0,
iteration_size=0,
iteration_stride=0,
lock_acq_enable=0,
lock_acq_id=0,
lock_acq_val=0,
lock_rel_id=0,
lock_rel_val=0,
next_bd=0,
use_next_bd=0,
valid_bd=1,
)
# Set start BD to our shim bd_Id (3)
ipu_write32(column=0, row=0, address=0x1D20C, value=trace_bd_id)
configure_simple_tracing(ComputeTile2,
ShimTile,
bd_id=13,
channel=1,
size=trace_size,
offset=N_in_bytes,
start=0x1,
stop=0x0,
events={0x4B, 0x22, 0x21, 0x25, 0x2D, 0x2C, 0x1A, 0x4F})

ipu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
ipu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
Expand Down
2 changes: 1 addition & 1 deletion programming_examples/basic/vector_scalar/test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ int main(int argc, const char *argv[]) {
auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0));
auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t),
XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t),
XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t),
Expand Down

0 comments on commit fca2795

Please sign in to comment.