Skip to content

Commit

Permalink
basic passthrough_kernel
Browse files Browse the repository at this point in the history
  • Loading branch information
denolf committed Apr 10, 2024
1 parent 48a8dfb commit cd2345a
Show file tree
Hide file tree
Showing 5 changed files with 424 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -15,29 +15,18 @@
#include <stdio.h>
#include <stdlib.h>

#define REL_WRITE 0
#define REL_READ 1

#include <aie_api/aie.hpp>

// Copies data from `in` to `out` without modifying it.
//
// Both pointers are reinterpreted as v64uint8 pointers and one v64uint8
// element is copied per loop iteration, while the loop counter advances by N
// until it reaches height * width.
// NOTE(review): this presumes that N elements of T occupy exactly one
// v64uint8 — confirm against the callers.
// NOTE(review): the signature line below appears twice in this view; it looks
// like the old and new revision of the same line from a diff rendering.
template <typename T, int N>
__attribute__((noinline)) void passThrough_aie(T *restrict in, T *restrict out,
__attribute__((noinline)) void passThrough_aie(T *restrict in, T *restrict out,
const int32_t height,
const int32_t width) {
//::aie::vector<T, N> data_out;
//::aie::mask<N> temp_val;
// View both buffers as arrays of v64uint8 so each assignment moves one vector.
v64uint8 *restrict outPtr = (v64uint8 *)out;
v64uint8 *restrict inPtr = (v64uint8 *)in;

for (int j = 0; j < (height * width); j += N) // Nx samples per loop
// presumably compiler hints requesting pipelining and a minimum trip count of
// 6 — confirm with the Chess compiler documentation
chess_prepare_for_pipelining chess_loop_range(6, ) {
//::aie::vector<T, N> tmpVector = ::aie::load_v(in);
//::aie::store_v(out, tmpVector);

// Copy one vector and advance both pointers to the next one.
*outPtr++ = *inPtr++;

// in += N;
// out += N;
}
}

Expand Down
75 changes: 75 additions & 0 deletions programming_examples/basic/passthrough_kernel/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2023 Xilinx Inc.

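# parameters
# -DBOOST_ROOT: Path to Boost install
# -DOpenCV_DIR: Path to OpenCV install (not referenced by this CMakeLists.txt)
# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo
# -DXRT_LIB_DIR: Path to xrt_coreutil.lib
# -DTARGET_NAME: Target name to be built
# -DPASSTHROUGH_SIZE: Value handed to the sources as the PASSTHROUGH_SIZE compile definition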

# This must be the first command in the file. The version is 3.13 because
# target_link_directories(), which this file calls further down, was
# introduced in that release; declaring anything older lets an outdated CMake
# start configuring and then fail on an unknown command.
cmake_minimum_required(VERSION 3.13)

# Look for powershell.exe. The result variable is called WSL, so finding it is
# presumably taken to mean that the build runs under WSL on a Windows host.
find_program(WSL NAMES powershell.exe)

# Default dependency locations; each one can be overridden with -D<NAME>=...
if(WSL)
  set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0
      CACHE STRING "Path to Boost install")
  set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include
      CACHE STRING "Path to XRT cloned repo")
  set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL
      CACHE STRING "Path to xrt_coreutil.lib")
else()
  set(BOOST_ROOT /usr/include/boost
      CACHE STRING "Path to Boost install")
  set(XRT_INC_DIR /opt/xilinx/xrt/include
      CACHE STRING "Path to XRT cloned repo")
  set(XRT_LIB_DIR /opt/xilinx/xrt/lib
      CACHE STRING "Path to xrt_coreutil.lib")
endif()

# User-tunable build parameters.
set(PASSTHROUGH_SIZE 4096 CACHE STRING "size")
set(TARGET_NAME test CACHE STRING "Target to be built")

# The project and the executable target both take their name from TARGET_NAME.
set(ProjectName ${TARGET_NAME})
set(currentTarget ${TARGET_NAME})

if(WSL)
  # Place Release-configuration executables directly in the top-level build
  # directory.
  set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR})
endif()

project(${ProjectName})

# Boost is mandatory: configuration stops if it cannot be located.
find_package(Boost REQUIRED)

# Host executable, built from xrtUtils.cpp in the shared utils directory and
# this example's test.cpp.
add_executable(${currentTarget}
  ${CMAKE_CURRENT_SOURCE_DIR}/../../utils/xrtUtils.cpp
  test.cpp
)

# Build settings for the host executable. Everything is PRIVATE: the target is
# an executable, so no other target consumes its usage requirements and there
# is nothing to propagate.

# PASSTHROUGH_SIZE is forwarded from the cache variable of the same name.
target_compile_definitions(${currentTarget} PRIVATE
  PASSTHROUGH_SIZE=${PASSTHROUGH_SIZE}
  DISABLE_ABI_CHECK=1
)

# Header search path: shared utils, XRT headers and the Boost headers located
# by find_package(Boost).
target_include_directories(${currentTarget} PRIVATE
  ${CMAKE_CURRENT_SOURCE_DIR}/../../utils
  ${XRT_INC_DIR}
  ${Boost_INCLUDE_DIRS}
)

# Library search path for the libraries that are linked by plain name.
target_link_directories(${currentTarget} PRIVATE
  ${XRT_LIB_DIR}
  ${Boost_LIBRARY_DIRS}
)

# xrt_coreutil is linked in every configuration, so it is stated once instead
# of being repeated in both branches. PRIVATE is used because the target is an
# executable and nothing links against it.
target_link_libraries(${currentTarget} PRIVATE
  xrt_coreutil
)

# The Boost libraries are linked by name only when powershell.exe was not
# found (the non-WSL configuration). The resulting link order is unchanged:
# xrt_coreutil, boost_program_options, boost_filesystem.
if(NOT WSL)
  target_link_libraries(${currentTarget} PRIVATE
    boost_program_options
    boost_filesystem
  )
endif()
49 changes: 49 additions & 0 deletions programming_examples/basic/passthrough_kernel/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
##===- Makefile -----------------------------------------------------------===##
#
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
##===----------------------------------------------------------------------===##

include ../makefile-common

# Extra directory searched for prerequisites that are not found locally.
VPATH := ../../../aie_kernels/aie_generic

# Passed to aie2.py and to the host CMake build; also part of the names of the
# generated .mlir and .xclbin files.
PASSTHROUGH_SIZE = 4096

targetname = passthrough_kernel

# `run` never creates a file named "run", so it is declared phony too;
# otherwise a stray file with that name would stop the recipe from executing.
.PHONY: all template clean run

all: build/final_${PASSTHROUGH_SIZE}.xclbin

# Generate the MLIR design: run aie2.py with the requested size and capture
# its standard output as the target file.
build/aie2_lineBased_8b_$(PASSTHROUGH_SIZE).mlir: aie2.py
	mkdir -p $(@D)
	python3 $< $(PASSTHROUGH_SIZE) > $@

# Compile the kernel source. The compiler is invoked from inside the build
# directory, so the prerequisite path is prefixed with "../" and the output is
# given as a bare file name.
build/passThrough.cc.o: passThrough.cc
	mkdir -p $(@D)
	cd $(@D) && xchesscc_wrapper $(CHESSCCWRAP2_FLAGS) -DBIT_WIDTH=8 -c $(<:%=../%) -o $(@F)

# Produce the xclbin from the generated MLIR (the first prerequisite). The
# command runs inside the build directory and is also asked to write the
# instruction stream to insts.txt there.
build/final_$(PASSTHROUGH_SIZE).xclbin: build/aie2_lineBased_8b_$(PASSTHROUGH_SIZE).mlir build/passThrough.cc.o
	mkdir -p $(@D)
	cd $(@D) && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \
		--xclbin-name=$(@F) --ipu-insts-name=insts.txt $(<:%=../%)

# Build the host program with CMake in a fresh _build directory, then copy the
# resulting binary next to this Makefile under the .exe name. The name of the
# binary produced by CMake depends on whether $(powershell) is powershell.exe.
# NOTE(review): $(powershell) is not set in this file; presumably it comes
# from ../makefile-common — confirm.
$(targetname).exe: test.cpp
	rm -rf _build
	mkdir -p _build
	cd _build && $(powershell) cmake .. -DTARGET_NAME=$(targetname) -DPASSTHROUGH_SIZE=$(PASSTHROUGH_SIZE)
	cd _build && $(powershell) cmake --build . --config Release
ifeq "$(powershell)" "powershell.exe"
	cp _build/$(targetname).exe $@
else
	cp _build/$(targetname) $@
endif

# Run the host program (the first prerequisite) against the generated xclbin
# and instruction file.
run: $(targetname).exe build/final_$(PASSTHROUGH_SIZE).xclbin build/insts.txt
	$(powershell) ./$< -x build/final_$(PASSTHROUGH_SIZE).xclbin -i build/insts.txt -k MLIR_AIE

# Remove both build directories and the copied host executable.
clean:
	rm -rf build _build $(targetname).exe
169 changes: 169 additions & 0 deletions programming_examples/basic/passthrough_kernel/aie2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
#
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2023 AMD Inc.

import sys

from aie.dialects.aie import *
from aie.dialects.aiex import *
from aie.dialects.scf import *
from aie.extras.context import mlir_mod_ctx

# Line width in bytes: 1024 unless exactly one command-line argument is given,
# in which case that argument, parsed as an integer, is used instead.
width = int(sys.argv[1]) if len(sys.argv) == 2 else 1024

lineWidthInBytes = width
lineWidthInInt32s = lineWidthInBytes // 4

# Trace capture is disabled by default; the sizes below describe the trace
# buffer in bytes and in 32-bit words.
enableTrace = False
traceSizeInBytes = 8192
traceSizeInInt32s = traceSizeInBytes // 4


def passThroughAIE2():
    """Build the passthrough design as an MLIR module and print it to stdout.

    The design declares a shim tile (0, 0) and a compute tile (0, 2), connects
    them with the object FIFOs "in" and "out", and has the compute tile call
    the external function "passThroughLine" once per acquired line. A
    `sequence` function describes the host-side data movement and, when
    `enableTrace` is set, the trace configuration.
    """
    with mlir_mod_ctx() as ctx:

        @device(AIEDevice.ipu)
        def device_body():
            # One line of `lineWidthInBytes` unsigned 8-bit elements.
            line_type = T.memref(lineWidthInBytes, T.ui8())

            # The kernel is only declared here; it is expected to come from
            # the object file named on the core below.
            pass_through_fn = external_func(
                "passThroughLine", inputs=[line_type, line_type, T.i32()]
            )

            # Tiles used by the design.
            shim_tile = tile(0, 0)
            compute_tile = tile(0, 2)

            if enableTrace:
                flow(compute_tile, "Trace", 0, shim_tile, "DMA", 1)

            # Object FIFOs carrying lines into and out of the compute tile.
            fifo_in = object_fifo("in", shim_tile, compute_tile, 2, line_type)
            fifo_out = object_fifo("out", compute_tile, shim_tile, 2, line_type)

            # Program running on the compute tile.
            @core(compute_tile, "passThrough.cc.o")
            def core_body():
                for _ in for_(sys.maxsize):
                    out_elem = fifo_out.acquire(ObjectFifoPort.Produce, 1)
                    in_elem = fifo_in.acquire(ObjectFifoPort.Consume, 1)
                    call(pass_through_fn, [in_elem, out_elem, width])
                    fifo_in.release(ObjectFifoPort.Consume, 1)
                    fifo_out.release(ObjectFifoPort.Produce, 1)
                    yield_([])

            # The host-side tensors are described in 32-bit words.
            tensor_bytes = width
            tensor_words = tensor_bytes // 4
            tensor_type = T.memref(tensor_words, T.i32())

            @FuncOp.from_py_func(tensor_type, tensor_type, tensor_type)
            def sequence(inTensor, outTensor, notUsed):
                if enableTrace:
                    # Trace output.
                    #
                    # Trace_Event0 / Trace_Event1 select the events to trace.
                    # Event buffers only appear to be transferred to DDR in
                    # bursts of 256 bytes. If fewer than 256 bytes are
                    # written, the trace output may be missing, or may only
                    # show up on the next kernel invocation once the buffer
                    # has filled. Although events are encoded as 4-byte words,
                    # more than 64 events may be needed to reach 256 bytes and
                    # trigger a flush, because repeated events can be
                    # 'compressed' by the trace mechanism. The original note
                    # recommends adding the "assert TRUE" event, which fires
                    # every cycle, to one slot so the buffer fills quickly.
                    # NOTE(review): none of the eight slots configured below
                    # is TRUE (0x01) — confirm whether that is intended.
                    #
                    # Some events:
                    #   TRUE                       (0x01)
                    #   STREAM_STALL               (0x18)
                    #   LOCK_STALL                 (0x1A)
                    #   EVENTS_CORE_INSTR_EVENT_1  (0x22)
                    #   EVENTS_CORE_INSTR_EVENT_0  (0x21)
                    #   INSTR_VECTOR               (0x25) Core executes a vector MAC, ADD or compare instruction
                    #   INSTR_LOCK_ACQUIRE_REQ     (0x2C) Core executes a lock acquire instruction
                    #   INSTR_LOCK_RELEASE_REQ     (0x2D) Core executes a lock release instruction
                    #   EVENTS_CORE_PORT_RUNNING_1 (0x4F)
                    #   EVENTS_CORE_PORT_RUNNING_0 (0x4B)
                    #
                    # Event slots as configured by the first two writes:
                    #   0: Kernel executes vector instruction
                    #   1: Event 0 -- Kernel starts
                    #   2: Event 1 -- Kernel done
                    #   3: Port_Running_0
                    #   4: Port_Running_1
                    #   5: Lock Stall
                    #   6: Lock Acquire Instr
                    #   7: Lock Release Instr
                    trace_register_writes = (
                        # Trace_Event0 (4 slots)
                        (0x340E0, 0x4B222125),
                        # Trace_Event1 (4 slots)
                        (0x340E4, 0x2D2C1A4F),
                        # Stream_Switch_Event_Port_Selection_0: needed to
                        # capture the Port_Running_0 and Port_Running_1 events.
                        (0x3FF00, 0x121),
                        # Trace_Control0: trace start and stop triggers; the
                        # start event is set to TRUE.
                        (0x340D0, 0x10000),
                    )
                    for reg_address, reg_value in trace_register_writes:
                        IpuWrite32(0, 2, reg_address, reg_value)

                    # Start trace copy out.
                    IpuWriteBdShimTile(
                        bd_id=3,
                        buffer_length=traceSizeInBytes,
                        buffer_offset=tensor_bytes,
                        enable_packet=0,
                        out_of_order_id=0,
                        packet_id=0,
                        packet_type=0,
                        column=0,
                        column_num=1,
                        d0_stride=0,
                        d0_wrap=0,
                        d1_stride=0,
                        d1_wrap=0,
                        d2_stride=0,
                        ddr_id=2,
                        iteration_current=0,
                        iteration_stride=0,
                        iteration_wrap=0,
                        lock_acq_enable=0,
                        lock_acq_id=0,
                        lock_acq_val=0,
                        lock_rel_id=0,
                        lock_rel_val=0,
                        next_bd=0,
                        use_next_bd=0,
                        valid_bd=1,
                    )
                    IpuWrite32(0, 0, 0x1D20C, 0x3)

                # Transfer the whole tensor through each FIFO, then sync.
                transfer_sizes = [1, 1, 1, tensor_words]
                ipu_dma_memcpy_nd(
                    metadata="in",
                    bd_id=0,
                    mem=inTensor,
                    sizes=transfer_sizes,
                )
                ipu_dma_memcpy_nd(
                    metadata="out",
                    bd_id=1,
                    mem=outTensor,
                    sizes=[1, 1, 1, tensor_words],
                )
                ipu_sync(column=0, row=0, direction=0, channel=0)

        print(ctx.module)


passThroughAIE2()
Loading

0 comments on commit cd2345a

Please sign in to comment.