Skip to content

Commit

Permalink
[ASPLOS][WIP] Passthrough kernel in basic examples (Xilinx#1216)
Browse files Browse the repository at this point in the history
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
  • Loading branch information
2 people authored and fifield committed Apr 12, 2024
1 parent c51c7aa commit 0300869
Show file tree
Hide file tree
Showing 16 changed files with 444 additions and 26 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -15,30 +15,21 @@
#include <stdio.h>
#include <stdlib.h>

#define REL_WRITE 0
#define REL_READ 1

#include <aie_api/aie.hpp>

template <typename T, int N>
__attribute__((noinline)) void passThrough_aie(T *restrict in, T *restrict out,
const int32_t height,
const int32_t width) {
//::aie::vector<T, N> data_out;
//::aie::mask<N> temp_val;
event0();

v64uint8 *restrict outPtr = (v64uint8 *)out;
v64uint8 *restrict inPtr = (v64uint8 *)in;

for (int j = 0; j < (height * width); j += N) // Nx samples per loop
chess_prepare_for_pipelining chess_loop_range(6, ) {
//::aie::vector<T, N> tmpVector = ::aie::load_v(in);
//::aie::store_v(out, tmpVector);

*outPtr++ = *inPtr++;

// in += N;
// out += N;
}
chess_prepare_for_pipelining chess_loop_range(6, ) { *outPtr++ = *inPtr++; }

event1();
}

extern "C" {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,7 @@ ACDC_AIE = $(dir $(shell which aie-opt))/..

SHELL := /bin/bash

targetname = passThroughHardware
devicename = ipu
col = 0
targetname = passThroughDMAs
LENGTH ?= 4096

all: build/final.xclbin build/insts.txt
Expand Down Expand Up @@ -71,4 +69,4 @@ vck5000: build/aie.mlir


clean:
rm -rf build _build inst aie.mlir.prj core_* test.elf ${targetname}.exe
rm -rf build _build inst ${targetname}.exe
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
from aie.extras.context import mlir_mod_ctx

N = 4096
N_in_bytes = N * 4


# Deciphering the command line arguments
Expand Down Expand Up @@ -54,9 +53,8 @@ def device_body():
# Compute tile 2
@core(ComputeTile2)
def core_body():
tmp = memref.alloc(1, T.i32())
v0 = arith.constant(0, T.i32())
memref.store(v0, tmp, [0])
for _ in for_(sys.maxsize):
yield_([])

# To/from AIE-array data movement
tensor_ty = T.memref(N, T.i32())
Expand Down
75 changes: 75 additions & 0 deletions programming_examples/basic/passthrough_kernel/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2024 Xilinx Inc.

# parameters
# -DBOOST_ROOT: Path to Boost install
# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo
# -DXRT_LIB_DIR: Path to xrt_coreutil.lib
# -DTARGET_NAME: Target name to be built
# -DPASSTHROUGH_SIZE: Value handed to the sources as the PASSTHROUGH_SIZE
#                     compile definition (default 4096)

# target_link_directories(), used further down, was introduced in CMake 3.13,
# so that is the oldest release this file can be configured with.
cmake_minimum_required(VERSION 3.13)

# WSL is set when powershell.exe is found on PATH; the variable name suggests
# this is taken to mean "building from WSL for Windows". It selects the default
# dependency locations and the libraries linked below.
find_program(WSL NAMES powershell.exe)

if (NOT WSL)
    set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
    set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
    set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
else()
    set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
    set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
    set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
endif ()

set(PASSTHROUGH_SIZE 4096 CACHE STRING "size")
set(TARGET_NAME test CACHE STRING "Target to be built")

SET (ProjectName ${TARGET_NAME})
SET (currentTarget ${TARGET_NAME})

# In the WSL case, place Release binaries directly in the build directory
# instead of a per-configuration subdirectory.
if ( WSL )
    set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR})
endif ()

project(${ProjectName})

# Find packages
find_package(Boost REQUIRED)

# Host executable: the example's test.cpp plus the shared test utilities.
add_executable(${currentTarget}
    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp
    test.cpp
)

target_compile_definitions(${currentTarget} PUBLIC
    PASSTHROUGH_SIZE=${PASSTHROUGH_SIZE}
    DISABLE_ABI_CHECK=1
)

target_include_directories (${currentTarget} PUBLIC
    ${CMAKE_CURRENT_SOURCE_DIR}/../../utils
    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib
    ${XRT_INC_DIR}
    ${Boost_INCLUDE_DIRS}
)

target_link_directories(${currentTarget} PUBLIC
    ${XRT_LIB_DIR}
    ${Boost_LIBRARY_DIRS}
)

# The Boost libraries are only named explicitly in the non-WSL branch.
if (NOT WSL)
    target_link_libraries(${currentTarget} PUBLIC
        xrt_coreutil
        boost_program_options
        boost_filesystem
    )
else()
    target_link_libraries(${currentTarget} PUBLIC
        xrt_coreutil
    )
endif()
49 changes: 49 additions & 0 deletions programming_examples/basic/passthrough_kernel/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
##===- Makefile -----------------------------------------------------------===##
#
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
##===----------------------------------------------------------------------===##

# Shared settings; CHESSCCWRAP2_FLAGS and powershell used below are not defined
# in this file and are expected to come from this include.
include ../../makefile-common

# Lets make find passThrough.cc in the shared kernel directory.
VPATH := ../../../aie_kernels/aie_generic

# Problem size forwarded to aie2.py and to the host build.
PASSTHROUGH_SIZE = 4096

targetname = passThroughKernel

# `run` never produces a file called "run", so it must be phony too; otherwise
# a stray file of that name would silently disable the target.
.PHONY: all template run clean

all: build/final_${PASSTHROUGH_SIZE}.xclbin

# Generate the MLIR design from the Python description.
build/aie2_lineBased_8b_${PASSTHROUGH_SIZE}.mlir: aie2.py
	mkdir -p ${@D}
	python3 $< ${PASSTHROUGH_SIZE} > $@

# Compile the AIE kernel object. The compiler runs inside build/, hence the
# ../ prefix added to the source path.
build/passThrough.cc.o: passThrough.cc
	mkdir -p ${@D}
	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -DBIT_WIDTH=8 -c $(<:%=../%) -o ${@F}

# Lower the design to an xclbin. This recipe also writes build/insts.txt.
build/final_${PASSTHROUGH_SIZE}.xclbin: build/aie2_lineBased_8b_${PASSTHROUGH_SIZE}.mlir build/passThrough.cc.o
	mkdir -p ${@D}
	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \
		--xclbin-name=${@F} --ipu-insts-name=insts.txt $(<:%=../%)

# build/insts.txt is a by-product of the xclbin recipe above. Declaring the
# dependency gives make a rule for it (needed by `run`, and for correct
# ordering under -j); the recipe only verifies that the file really exists.
# NOTE(review): the file name is not size-specific, so switching
# PASSTHROUGH_SIZE back to an already-built size can leave a stale insts.txt.
build/insts.txt: build/final_${PASSTHROUGH_SIZE}.xclbin
	@test -f $@

# Build the host executable with CMake in a scratch _build directory.
${targetname}.exe: test.cpp
	rm -rf _build
	mkdir -p _build
	cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname} -DPASSTHROUGH_SIZE=${PASSTHROUGH_SIZE}
	cd _build && ${powershell} cmake --build . --config Release
ifeq "${powershell}" "powershell.exe"
	cp _build/${targetname}.exe $@
else
	cp _build/${targetname} $@
endif

run: ${targetname}.exe build/final_${PASSTHROUGH_SIZE}.xclbin build/insts.txt
	${powershell} ./$< -x build/final_${PASSTHROUGH_SIZE}.xclbin -i build/insts.txt -k MLIR_AIE

clean:
	rm -rf build _build ${targetname}.exe
170 changes: 170 additions & 0 deletions programming_examples/basic/passthrough_kernel/aie2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
#
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2024 AMD Inc.

import sys

from aie.dialects.aie import *
from aie.dialects.aiex import *
from aie.dialects.scf import *
from aie.extras.context import mlir_mod_ctx

# Problem size; exactly one command-line argument overrides the default of 1024.
# The rest of the script uses this value as a byte count.
N = int(sys.argv[1]) if len(sys.argv) == 2 else 1024

# The input is chopped into 4 sub-tensors; each one is a "line".
lineWidthInBytes = N // 4
lineWidthInInt32s = lineWidthInBytes // 4

# Trace settings (tracing is off by default).
traceSizeInBytes = 8192
traceSizeInInt32s = traceSizeInBytes // 4
enableTrace = False


def passthroughKernel():
    """Build the passthrough design inside an MLIR module context and print it.

    The design declares tile (0, 0) as the shim tile and tile (0, 2) as the
    compute tile, links them with an "in" and an "out" object FIFO, and has the
    core call the external ``passThroughLine`` function on each acquired
    input/output buffer pair. A ``sequence`` function then describes the
    transfers for the two FIFOs (plus optional trace setup when ``enableTrace``
    is set). The resulting module is written to stdout via ``print``.
    """
    with mlir_mod_ctx() as ctx:

        @device(AIEDevice.ipu)
        def device_body():
            # define types
            # One FIFO buffer: lineWidthInBytes (N // 4) unsigned bytes.
            memRef_ty = T.memref(lineWidthInBytes, T.ui8())

            # AIE Core Function declarations
            # Signature: (input buffer, output buffer, i32). The i32 passed at
            # the call site below is lineWidthInBytes.
            passThroughLine = external_func(
                "passThroughLine", inputs=[memRef_ty, memRef_ty, T.i32()]
            )

            # Tile declarations
            ShimTile = tile(0, 0)
            ComputeTile2 = tile(0, 2)

            # Route the compute tile's trace port to the shim tile only when
            # tracing is requested.
            if enableTrace:
                flow(ComputeTile2, "Trace", 0, ShimTile, "DMA", 1)

            # AIE-array data movement with object fifos
            # NOTE(review): the literal 2 is presumably the FIFO depth
            # (double buffering) -- confirm against the object_fifo signature.
            of_in = object_fifo("in", ShimTile, ComputeTile2, 2, memRef_ty)
            of_out = object_fifo("out", ComputeTile2, ShimTile, 2, memRef_ty)

            # Set up compute tiles

            # Compute tile 2
            @core(ComputeTile2, "passThrough.cc.o")
            def core_body():
                # sys.maxsize iterations: effectively an endless loop.
                for _ in for_(sys.maxsize):
                    # Acquire one output and one input buffer, copy through
                    # the kernel, then hand both buffers back.
                    elemOut = of_out.acquire(ObjectFifoPort.Produce, 1)
                    elemIn = of_in.acquire(ObjectFifoPort.Consume, 1)
                    call(passThroughLine, [elemIn, elemOut, lineWidthInBytes])
                    of_in.release(ObjectFifoPort.Consume, 1)
                    of_out.release(ObjectFifoPort.Produce, 1)
                    yield_([])

            #    print(ctx.module.operation.verify())

            tensorSize = N
            tensorSizeInInt32s = tensorSize // 4
            # NOTE(review): the argument type is sized with lineWidthInInt32s
            # (N // 16 elements) while the transfers below move
            # tensorSizeInInt32s (N // 4) elements -- confirm this is intended.
            tensor_ty = T.memref(lineWidthInInt32s, T.i32())

            @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty)
            def sequence(inTensor, outTensor, notUsed):
                if enableTrace:
                    # Trace output

                    # Trace_Event0, Trace_Event1: Select which events to trace.
                    # Note that the event buffers only appear to be transferred to DDR in
                    # bursts of 256 bytes. If less than 256 bytes are written, you may not
                    # see trace output, or only see it on the next iteration of your
                    # kernel invocation, as the buffer gets filled up. Note that, even
                    # though events are encoded as 4 byte words, it may take more than 64
                    # events to fill the buffer to 256 bytes and cause a flush, since
                    # multiple repeating events can be 'compressed' by the trace mechanism.
                    # In order to always generate sufficient events, we add the "assert
                    # TRUE" event to one slot, which fires every cycle, and thus fills our
                    # buffer quickly.
                    # NOTE(review): none of the eight slots written below is
                    # actually set to TRUE (0x01) -- confirm whether the
                    # paragraph above or the register values are out of date.

                    # Some events:
                    # TRUE                       (0x01)
                    # STREAM_STALL               (0x18)
                    # LOCK_STALL                 (0x1A)
                    # EVENTS_CORE_INSTR_EVENT_1  (0x22)
                    # EVENTS_CORE_INSTR_EVENT_0  (0x21)
                    # INSTR_VECTOR               (0x25)  Core executes a vector MAC, ADD or compare instruction
                    # INSTR_LOCK_ACQUIRE_REQ     (0x2C)  Core executes a lock acquire instruction
                    # INSTR_LOCK_RELEASE_REQ     (0x2D)  Core executes a lock release instruction
                    # EVENTS_CORE_PORT_RUNNING_1 (0x4F)
                    # EVENTS_CORE_PORT_RUNNING_0 (0x4B)

                    # Trace_Event0  (4 slots)
                    # Lowest byte is slot 0: 0x25, 0x21, 0x22, 0x4B.
                    IpuWrite32(0, 2, 0x340E0, 0x4B222125)
                    # Trace_Event1  (4 slots)
                    # Lowest byte is slot 4: 0x4F, 0x1A, 0x2C, 0x2D.
                    IpuWrite32(0, 2, 0x340E4, 0x2D2C1A4F)

                    # Event slots as configured above:
                    # 0: Kernel executes vector instruction
                    # 1: Event 0 -- Kernel starts
                    # 2: Event 1 -- Kernel done
                    # 3: Port_Running_0
                    # 4: Port_Running_1
                    # 5: Lock Stall
                    # 6: Lock Acquire Instr
                    # 7: Lock Release Instr

                    # Stream_Switch_Event_Port_Selection_0
                    # This is necessary to capture the Port_Running_0 and Port_Running_1 events
                    IpuWrite32(0, 2, 0x3FF00, 0x121)

                    # Trace_Control0: Define trace start and stop triggers. Set start event TRUE.
                    IpuWrite32(0, 2, 0x340D0, 0x10000)

                    # Start trace copy out.
                    # The trace buffer (traceSizeInBytes long) is placed at byte
                    # offset tensorSize; presumably ddr_id=2 selects the third
                    # sequence argument (notUsed) -- confirm.
                    IpuWriteBdShimTile(
                        bd_id=3,
                        buffer_length=traceSizeInBytes,
                        buffer_offset=tensorSize,
                        enable_packet=0,
                        out_of_order_id=0,
                        packet_id=0,
                        packet_type=0,
                        column=0,
                        column_num=1,
                        d0_stride=0,
                        d0_wrap=0,
                        d1_stride=0,
                        d1_wrap=0,
                        d2_stride=0,
                        ddr_id=2,
                        iteration_current=0,
                        iteration_stride=0,
                        iteration_wrap=0,
                        lock_acq_enable=0,
                        lock_acq_id=0,
                        lock_acq_val=0,
                        lock_rel_id=0,
                        lock_rel_val=0,
                        next_bd=0,
                        use_next_bd=0,
                        valid_bd=1,
                    )
                    IpuWrite32(0, 0, 0x1D20C, 0x3)

                # Move tensorSizeInInt32s 32-bit words through the "in" FIFO
                # from inTensor and through the "out" FIFO to outTensor.
                ipu_dma_memcpy_nd(
                    metadata="in",
                    bd_id=0,
                    mem=inTensor,
                    sizes=[1, 1, 1, tensorSizeInInt32s],
                )
                ipu_dma_memcpy_nd(
                    metadata="out",
                    bd_id=1,
                    mem=outTensor,
                    sizes=[1, 1, 1, tensorSizeInInt32s],
                )
                # NOTE(review): presumably waits for the transfer on the shim
                # tile (column 0, row 0), channel 0 to finish -- confirm.
                ipu_sync(column=0, row=0, direction=0, channel=0)

        # Emit the generated module as text on stdout.
        print(ctx.module)


# Script entry point: generate and print the design when the file is run.
passthroughKernel()
12 changes: 12 additions & 0 deletions programming_examples/basic/passthrough_kernel/run.lit
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// (c) Copyright 2023 Advanced Micro Devices, Inc.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// End-to-end test for the passthrough_kernel example with size 4096: compile
// the kernel object, generate and lower the design, build the host program,
// execute it, and expect its output to contain the pass marker checked below.
//
// REQUIRES: ryzen_ai, chess
//
// Step 1: compile the shared passThrough.cc kernel with -DBIT_WIDTH=8.
// RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -c %S/../../../aie_kernels/aie_generic/passThrough.cc -o passThrough.cc.o
// Step 2: generate the MLIR design from aie2.py and clean it up with aie-opt.
// RUN: %python %S/aie2.py 4096 | aie-opt -cse -canonicalize -o ./aie.mlir
// Step 3: lower the design to aie.xclbin and insts.txt (host code is not compiled here).
// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
// Step 4: build the host program with the same PASSTHROUGH_SIZE.
// RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall -DPASSTHROUGH_SIZE=4096 -I%S/../../utils %S/../../utils/xrtUtils.cpp %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
// Step 5: execute the host program and pipe its output to FileCheck.
// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
// CHECK: PASS!

Loading

0 comments on commit 0300869

Please sign in to comment.