Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

tracing experiment #1218

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 11 additions & 76 deletions programming_examples/basic/passthrough_kernel/aie2.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from aie.dialects.aiex import *
from aie.dialects.scf import *
from aie.extras.context import mlir_mod_ctx
from aie.trace_utils import *

N = 1024

Expand Down Expand Up @@ -43,7 +44,7 @@ def device_body():
ComputeTile2 = tile(0, 2)

if enableTrace:
flow(ComputeTile2, "Trace", 0, ShimTile, "DMA", 1)
flow(ComputeTile2, WireBundle.Trace, 0, ShimTile, WireBundle.DMA, 1)

# AIE-array data movement with object fifos
of_in = object_fifo("in", ShimTile, ComputeTile2, 2, memRef_ty)
Expand Down Expand Up @@ -71,84 +72,18 @@ def core_body():
@FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty)
def sequence(inTensor, outTensor, notUsed):
if enableTrace:
# Trace output

# Trace_Event0, Trace_Event1: Select which events to trace.
# Note that the event buffers only appear to be transferred to DDR in
# bursts of 256 bytes. If less than 256 bytes are written, you may not
# see trace output, or only see it on the next iteration of your
# kernel invocation, as the buffer gets filled up. Note that, even
# though events are encoded as 4 byte words, it may take more than 64
# events to fill the buffer to 256 bytes and cause a flush, since
# multiple repeating events can be 'compressed' by the trace mechanism.
# In order to always generate sufficient events, we add the "assert
# TRUE" event to one slot, which fires every cycle, and thus fills our
# buffer quickly.

# Some events:
# TRUE (0x01)
# STREAM_STALL (0x18)
# LOCK_STALL (0x1A)
# EVENTS_CORE_INSTR_EVENT_1 (0x22)
# EVENTS_CORE_INSTR_EVENT_0 (0x21)
# INSTR_VECTOR (0x25) Core executes a vecotr MAC, ADD or compare instruction
# INSTR_LOCK_ACQUIRE_REQ (0x2C) Core executes a lock acquire instruction
# INSTR_LOCK_RELEASE_REQ (0x2D) Core executes a lock release instruction
# EVENTS_CORE_PORT_RUNNING_1 (0x4F)
# EVENTS_CORE_PORT_RUNNING_0 (0x4B)

# Trace_Event0 (4 slots)
IpuWrite32(0, 2, 0x340E0, 0x4B222125)
# Trace_Event1 (4 slots)
IpuWrite32(0, 2, 0x340E4, 0x2D2C1A4F)

# Event slots as configured above:
# 0: Kernel executes vector instruction
# 1: Event 0 -- Kernel starts
# 2: Event 1 -- Kernel done
# 3: Port_Running_0
# 4: Port_Running_1
# 5: Lock Stall
# 6: Lock Acquire Instr
# 7: Lock Release Instr

# Stream_Switch_Event_Port_Selection_0
# This is necessary to capture the Port_Running_0 and Port_Running_1 events
IpuWrite32(0, 2, 0x3FF00, 0x121)

# Trace_Control0: Define trace start and stop triggers. Set start event TRUE.
IpuWrite32(0, 2, 0x340D0, 0x10000)

# Start trace copy out.
IpuWriteBdShimTile(
configure_simple_tracing_aie2(
ComputeTile2,
ShimTile,
channel=1,
bd_id=3,
buffer_length=traceSizeInBytes,
buffer_offset=tensorSize,
enable_packet=0,
out_of_order_id=0,
packet_id=0,
packet_type=0,
column=0,
column_num=1,
d0_stride=0,
d0_wrap=0,
d1_stride=0,
d1_wrap=0,
d2_stride=0,
ddr_id=2,
iteration_current=0,
iteration_stride=0,
iteration_wrap=0,
lock_acq_enable=0,
lock_acq_id=0,
lock_acq_val=0,
lock_rel_id=0,
lock_rel_val=0,
next_bd=0,
use_next_bd=0,
valid_bd=1,
size=traceSizeInBytes,
offset=tensorSize,
start=0x1,
stop=0x0,
events=[0x4B, 0x22, 0x21, 0x25, 0x2D, 0x2C, 0x1A, 0x4F],
)
IpuWrite32(0, 0, 0x1D20C, 0x3)

ipu_dma_memcpy_nd(
metadata="in",
Expand Down
88 changes: 11 additions & 77 deletions programming_examples/basic/vector_scalar/aie2.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from aie.dialects.aiex import *
from aie.dialects.scf import *
from aie.extras.context import mlir_mod_ctx

from aie.trace_utils import *

def my_vector_scalar():
N = 4096
Expand Down Expand Up @@ -77,85 +77,19 @@ def core_body():
@FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty)
def sequence(A, B, C):

# Configure tracing, see https://github.com/Xilinx/mlir-aie/blob/resnet/docs/Tracing.md
if enable_tracing:
# 0x340D0: Trace Control 0
# 0xAABB---C
# AA <- Event to stop trace capture
# BB <- Event to start trace capture
# C <- Trace mode, 00=event=time, 01=event-PC, 10=execution
# Configure so that "Event 1" (always true) causes tracing to start
ipu_write32(
column=compute_tile2_col,
row=compute_tile2_row,
address=0x340D0,
value=0x00010000,
)
# 0x340D4: Trace Control 1
ipu_write32(
column=compute_tile2_col,
row=compute_tile2_row,
address=0x340D4,
value=0x00000000,
)
# 0x340E0: Trace Event Group 1 (Which events to trace)
# 0xAABBCCDD AA, BB, CC, DD <- four event slots
ipu_write32(
column=compute_tile2_col,
row=compute_tile2_row,
address=0x340E0,
value=0x4B222125,
)
# 0x340E4: Trace Event Group 2 (Which events to trace)
# 0xAABBCCDD AA, BB, CC, DD <- four event slots
ipu_write32(
column=compute_tile2_col,
row=compute_tile2_row,
address=0x340E4,
value=0x2D2C1A4F,
)

ipu_write32(
column=compute_tile2_col,
row=compute_tile2_row,
address=0x3FF00,
value=0x00000121,
)

# Configure a buffer descriptor to write tracing information that has been routed into this shim tile
# out to host DDR memory
trace_bd_id = 13 # use BD 13 for writing trace output from compute tile to DDR host memory
output_size = N_in_bytes
ipu_writebd_shimtile(
bd_id=trace_bd_id,
buffer_length=trace_size,
buffer_offset=output_size,
enable_packet=0,
out_of_order_id=0,
packet_id=0,
packet_type=0,
column=0,
column_num=1,
d0_size=0,
d0_stride=0,
d1_size=0,
d1_stride=0,
d2_stride=0,
configure_simple_tracing_aie2(
ComputeTile2,
ShimTile,
channel=1,
bd_id=13,
ddr_id=2,
iteration_current=0,
iteration_size=0,
iteration_stride=0,
lock_acq_enable=0,
lock_acq_id=0,
lock_acq_val=0,
lock_rel_id=0,
lock_rel_val=0,
next_bd=0,
use_next_bd=0,
valid_bd=1,
size=trace_size,
offset=N_in_bytes,
start=0x1,
stop=0x0,
events=[0x4B, 0x22, 0x21, 0x25, 0x2D, 0x2C, 0x1A, 0x4F],
)
# Set start BD to our shim bd_Id (3)
ipu_write32(column=0, row=0, address=0x1D20C, value=trace_bd_id)

ipu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
ipu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
Expand Down
88 changes: 11 additions & 77 deletions programming_examples/ml/relu/aie2.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from aie.dialects.aiex import *
from aie.dialects.scf import *
from aie.extras.context import mlir_mod_ctx

from aie.trace_utils import *

def my_relu():

Expand Down Expand Up @@ -115,85 +115,19 @@ def core_body():
@FuncOp.from_py_func(tensor_ty, tensor_ty)
def sequence(A, C):

# Configure tracing, see https://github.com/Xilinx/mlir-aie/blob/resnet/docs/Tracing.md
if enable_tracing:
# 0x340D0: Trace Control 0
# 0xAABB---C
# AA <- Event to stop trace capture
# BB <- Event to start trace capture
# C <- Trace mode, 00=event=time, 01=event-PC, 10=execution
# Configure so that "Event 1" (always true) causes tracing to start
ipu_write32(
column=0,
row=2,
address=0x340D0,
value=0x00010000,
)
# 0x340D4: Trace Control 1
ipu_write32(
column=0,
row=2,
address=0x340D4,
value=0x00000000,
)
# 0x340E0: Trace Event Group 1 (Which events to trace)
# 0xAABBCCDD AA, BB, CC, DD <- four event slots
ipu_write32(
column=0,
row=2,
address=0x340E0,
value=0x00222100,
)
# 0x340E4: Trace Event Group 2 (Which events to trace)
# 0xAABBCCDD AA, BB, CC, DD <- four event slots
ipu_write32(
column=0,
row=2,
address=0x340E4,
value=0x00000000,
)

ipu_write32(
column=0,
row=2,
address=0x3FF00,
value=0x00000121,
)

# Configure a buffer descriptor to write tracing information that has been routed into this shim tile
# out to host DDR memory
trace_bd_id = 13 # use BD 13 for writing trace output from compute tile to DDR host memory
output_size = N_in_bytes
ipu_writebd_shimtile(
bd_id=trace_bd_id,
buffer_length=trace_size,
buffer_offset=output_size,
enable_packet=0,
out_of_order_id=0,
packet_id=0,
packet_type=0,
column=0,
column_num=1,
d0_size=0,
d0_stride=0,
d1_size=0,
d1_stride=0,
d2_stride=0,
configure_simple_tracing_aie2(
cores[0],
ShimTile,
channel=1,
bd_id=13,
ddr_id=1,
iteration_current=0,
iteration_size=0,
iteration_stride=0,
lock_acq_enable=0,
lock_acq_id=0,
lock_acq_val=0,
lock_rel_id=0,
lock_rel_val=0,
next_bd=0,
use_next_bd=0,
valid_bd=1,
size=trace_size,
offset=N_in_bytes,
start=0x1,
stop=0x0,
events=[0x22, 0x21, 0x00],
)
# Set start BD to our shim bd_Id (13)
ipu_write32(column=0, row=0, address=0x1D20C, value=trace_bd_id)

ipu_dma_memcpy_nd(
metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, C_sz_in_i32s]
Expand Down
7 changes: 7 additions & 0 deletions python/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,13 @@ declare_mlir_python_sources(AIEPythonSources.Compiler
compiler/util.py
)

declare_mlir_python_sources(AIEPythonSources.Test
ADD_TO_PARENT AIEPythonSources
SOURCES
test_utils.py
trace_utils.py
)

if (AIE_ENABLE_XRT_PYTHON_BINDINGS)
declare_mlir_python_sources(AIEPythonSources.XRT
ADD_TO_PARENT AIEPythonSources
Expand Down
Loading
Loading