Skip to content

Commit

Permalink
ReLU with tracing (Xilinx#1204)
Browse files Browse the repository at this point in the history
ReLU example with tracing

Co-authored-by: pjr <pjr@xilinx.com>
Co-authored-by: Joseph Melber <jgmelber@gmail.com>
  • Loading branch information
3 people authored and fifield committed Apr 10, 2024
1 parent 21ef061 commit b0a2697
Show file tree
Hide file tree
Showing 5 changed files with 698 additions and 0 deletions.
41 changes: 41 additions & 0 deletions aie_kernels/relu.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
//===- relu.cc --------------------------------------------------*- C++ -*-===//
//
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// Copyright (C) 2023, Advanced Micro Devices, Inc.
//
//===----------------------------------------------------------------------===//

#define __AIENGINE__ 2
#define NOCPP
#define __AIEARCH__ 20

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <type_traits>

#include <aie_api/aie.hpp>

// Element-wise ReLU over a tile of bfloat16 values: c[k] = max(a[k], 0).
//
//   a          input tile (read only by this routine)
//   c          output tile (written by this routine)
//   TILE_SIZE  number of bfloat16 elements to process
//
// The data is consumed one 32-lane vector at a time, so every iteration reads
// and writes a full v32bfloat16 starting at the current offset; callers are
// expected to pass a TILE_SIZE that is a multiple of 32.
//
// NOTE(review): the chess_loop_range(32, 32) hint corresponds to
// 1024 / 32 iterations, i.e. the tile size used by bf16_relu below; confirm
// before calling with any other TILE_SIZE.
void relu(bfloat16 *restrict a, bfloat16 *restrict c, const int TILE_SIZE) {
  const int lanes = 32;
  const v32bfloat16 zero_vec = broadcast_zero_bfloat16();

  // event0()/event1() bracket the vector loop (presumably so the trace unit
  // can time it; see the tracing setup in aie2.py).
  event0();
  for (size_t offset = 0; offset < TILE_SIZE; offset += lanes)
    chess_prepare_for_pipelining chess_loop_range(32, 32) {
      const v32bfloat16 *src = (const v32bfloat16 *)(a + offset);
      v32bfloat16 *dst = (v32bfloat16 *)(c + offset);
      // Clamp every negative lane to zero.
      *dst = max(*src, zero_vec);
    }
  event1();
}

extern "C" {

// C-linkage entry point called from the AIE core: applies ReLU to one
// 1024-element bfloat16 tile, reading a_in and writing c_out.
void bf16_relu(bfloat16 *a_in, bfloat16 *c_out) {
  const int tile_elems = 1024;
  relu(a_in, c_out, tile_elems);
}

} // extern "C"
68 changes: 68 additions & 0 deletions programming_examples/basic/relu/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2023 Advanced Micro Devices, Inc.

# parameters
# -DBOOST_ROOT: Path to Boost install
# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo
# -DXRT_LIB_DIR: Path to xrt_coreutil.lib
# -DTARGET_NAME: Target name to be built

# cmake needs this line.
# target_link_directories() (used below) was introduced in CMake 3.13, so that
# is the oldest release able to process this file.
cmake_minimum_required(VERSION 3.13)

# The presence of powershell.exe on the PATH is used as the switch between the
# Linux defaults and the Windows-style (C:/...) defaults below.
find_program(WSL NAMES powershell.exe)

# Default dependency locations; each can be overridden with -D<NAME>=<path>.
if (NOT WSL)
    set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
    set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
    set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
else()
    set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
    set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
    set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
endif()

set(TARGET_NAME test CACHE STRING "Target to be built")

# The project and the single executable target share the requested name.
set(ProjectName ${TARGET_NAME})
set(currentTarget ${TARGET_NAME})

# With powershell.exe present, place Release binaries directly in the build
# directory instead of a per-configuration subdirectory.
if (WSL)
    set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR})
endif()

project(${ProjectName})

# Find packages
find_package(Boost REQUIRED)

add_executable(${currentTarget}
    test.cpp
)

target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1)

target_include_directories(${currentTarget} PUBLIC
    ${XRT_INC_DIR}
    ${Boost_INCLUDE_DIRS}
)

target_link_directories(${currentTarget} PUBLIC
    ${XRT_LIB_DIR}
    ${Boost_LIBRARY_DIRS}
)

# Boost libraries are named explicitly only on the non-powershell path; the
# other branch links xrt_coreutil alone.
if (NOT WSL)
    target_link_libraries(${currentTarget} PUBLIC
        xrt_coreutil
        boost_program_options
        boost_filesystem
    )
else()
    target_link_libraries(${currentTarget} PUBLIC
        xrt_coreutil
    )
endif()
54 changes: 54 additions & 0 deletions programming_examples/basic/relu/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
##===- Makefile -----------------------------------------------------------===##
#
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
##===----------------------------------------------------------------------===##

include ../makefile-common

targetname = testRelu

# These targets name actions rather than files. Declaring them phony keeps
# them runnable even if a file called e.g. "clean" or "trace" shows up in this
# directory.
.PHONY: all run run_g trace clean_trace clean

# build/insts.txt has no rule of its own: it is written by the
# build/final.xclbin recipe (see --ipu-insts-name below).
all: build/final.xclbin build/insts.txt

# Compile the AIE kernel to an object file inside build/.
build/bf16_relu.o: ${REPO_ROOT}/aie_kernels/relu.cc
	mkdir -p ${@D}
	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -I${REPO_ROOT}/my_install/mlir_aie/aie_runtime_lib/AIE2 -c $< -o ${@F}

# Generate the MLIR design by running the Python description.
build/aie.mlir: aie2.py
	mkdir -p ${@D}
	python3 $< > $@

# Build the xclbin and the instruction stream. aiecc.py runs from build/, so
# the first prerequisite (the .mlir) is passed with a ../ prefix.
build/final.xclbin: build/aie.mlir build/bf16_relu.o
	mkdir -p ${@D}
	cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
		--aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%)

# Build the host test program with CMake in a fresh _build/ directory and copy
# the resulting binary next to this Makefile as ${targetname}.exe.
${targetname}.exe: test.cpp
	rm -rf _build
	mkdir -p _build
	cd _build && ${powershell} cmake -E env CXXFLAGS="-std=c++23" cmake .. -D CMAKE_C_COMPILER=gcc-13 -D CMAKE_CXX_COMPILER=g++-13 -DTARGET_NAME=${targetname}
	cd _build && ${powershell} cmake --build . --config Release
ifeq "${powershell}" "powershell.exe"
	cp _build/${targetname}.exe $@
else
	cp _build/${targetname} $@
endif

# Run the design without a trace buffer.
run: ${targetname}.exe build/final.xclbin build/insts.txt
	${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE

# Run the design with a 65536-byte trace buffer (matches trace_size in aie2.py).
run_g: ${targetname}.exe build/final.xclbin build/insts.txt
	${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE --trace_sz 65536


# Convert the raw trace.txt into a JSON file.
trace:
	../../utils/parse_eventIR.py --filename trace.txt --mlir build/aie.mlir --colshift 1 > parse_eventIR_vs.json

clean_trace:
	rm -rf tmpTrace trace.txt

clean: clean_trace
	rm -rf build _build ${targetname}.exe

209 changes: 209 additions & 0 deletions programming_examples/basic/relu/aie2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
#
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2023 AMD Inc.

import sys

from aie.dialects.aie import *
from aie.dialects.aiex import *
from aie.dialects.scf import *
from aie.extras.context import mlir_mod_ctx


def my_relu():
    """Build the MLIR for a multi-core bf16 ReLU design and print it to stdout.

    The generated design moves N bf16 elements from the shim tile through the
    memory tile to ``n_cores`` compute tiles and back again. Each compute tile
    repeatedly calls the external ``bf16_relu`` kernel on ``n``-element tiles.
    When ``enable_tracing`` is set, the trace port of the first compute tile is
    routed to the shim DMA and the trace registers are programmed in the
    runtime sequence.
    """

    # Problem size: N elements of word_size_in bytes each (bf16).
    word_size_in = 2
    N = 65536
    N_in_bytes = N * word_size_in

    # The shim DMA transfers are expressed in 32-bit words.
    A_sz_in_i32s = N_in_bytes // 4
    C_sz_in_i32s = N_in_bytes // 4

    enable_tracing = True
    trace_size = 65536  # length handed to the trace buffer descriptor below

    # Tile sizes
    # n is the number of elements handed to one bf16_relu call; the kernel in
    # aie_kernels/relu.cc hard-codes the same 1024.
    n = 1024
    N_div_n = N // n

    n_cores = 2
    tiles = N_div_n // n_cores  # tiles processed by each core
    buffer_depth = 2

    with mlir_mod_ctx() as ctx:

        @device(AIEDevice.ipu)
        def device_body():
            memRef_ty = T.memref(n, T.bf16())

            # Type used in the tile memory
            memRef_A_ty = T.memref(n, T.bf16())
            memRef_C_ty = T.memref(n, T.bf16())

            # Type used in the memory tile, which aggregates across the
            # n_cores compute tiles (n * n_cores elements)
            memRef_A_MT_ty = T.memref(n * n_cores, T.bf16())
            memRef_C_MT_ty = T.memref(n * n_cores, T.bf16())

            # AIE Core Function declarations

            bf16_relu = external_func("bf16_relu", inputs=[memRef_ty, memRef_ty])

            # Tile declarations
            ShimTile = tile(0, 0)

            MemTile = tile(0, 1)
            # Compute tiles sit in column 0, starting at row 2.
            cores = [tile(0, 2 + i) for i in range(n_cores)]

            inA_fifo_names = [f"memA{i}" for i in range(n_cores)]
            outC_fifo_names = [f"memC{i}" for i in range(n_cores)]

            # Per-core object fifos, keyed by fifo name.
            inA_fifos = {}
            outC_fifos = {}

            # AIE-array data movement with object fifos
            # Input A: shim tile -> memory tile -> one fifo per compute tile
            inA = object_fifo("inA", ShimTile, MemTile, buffer_depth, memRef_A_MT_ty)
            for i in range(n_cores):
                inA_fifos[inA_fifo_names[i]] = object_fifo(
                    inA_fifo_names[i], MemTile, cores[i], buffer_depth, memRef_A_ty
                )
            object_fifo_link(inA, inA_fifo_names)

            # Output C: one fifo per compute tile -> memory tile -> shim tile
            for i in range(n_cores):
                outC_fifos[outC_fifo_names[i]] = object_fifo(
                    outC_fifo_names[i], cores[i], MemTile, buffer_depth, memRef_C_ty
                )
            outC = object_fifo("outC", MemTile, ShimTile, buffer_depth, memRef_C_MT_ty)
            object_fifo_link(outC_fifo_names[0:n_cores], outC)

            # Set up a circuit-switched flow from core to shim for tracing information
            # (only the first compute tile, cores[0], is traced)
            if enable_tracing:
                flow(cores[0], WireBundle.Trace, 0, ShimTile, WireBundle.DMA, 1)

            # Set up compute tiles
            for i in range(n_cores):
                # Compute tile i
                @core(cores[i], "bf16_relu.o")
                def core_body():
                    # Outer loop with a very large trip count; the inner loop
                    # handles this core's share of the tiles.
                    for _ in for_(0xFFFFFFFF):
                        for _ in for_(tiles):
                            elem_out = outC_fifos[outC_fifo_names[i]].acquire(
                                ObjectFifoPort.Produce, 1
                            )
                            elem_in_a = inA_fifos[inA_fifo_names[i]].acquire(
                                ObjectFifoPort.Consume, 1
                            )

                            call(bf16_relu, [elem_in_a, elem_out])

                            inA_fifos[inA_fifo_names[i]].release(
                                ObjectFifoPort.Consume, 1
                            )
                            outC_fifos[outC_fifo_names[i]].release(
                                ObjectFifoPort.Produce, 1
                            )
                            yield_([])
                        yield_([])

            # To/from AIE-array data movement
            # NOTE(review): this memref holds N i32 elements although the
            # transfers below move only N_in_bytes // 4 i32s -- confirm that
            # the larger type is intended.
            tensor_ty = T.memref(N, T.i32())

            @FuncOp.from_py_func(tensor_ty, tensor_ty)
            def sequence(A, C):

                # Configure tracing, see https://github.com/Xilinx/mlir-aie/blob/resnet/docs/Tracing.md
                # All register writes below target tile (0, 2), i.e. cores[0].
                if enable_tracing:
                    # 0x340D0: Trace Control 0
                    #          0xAABB---C
                    #            AA        <- Event to stop trace capture
                    #              BB      <- Event to start trace capture
                    #                   C  <- Trace mode, 00=event-time, 01=event-PC, 10=execution
                    # Configure so that "Event 1" (always true) causes tracing to start
                    ipu_write32(
                        column=0,
                        row=2,
                        address=0x340D0,
                        value=0x00010000,
                    )
                    # 0x340D4: Trace Control 1
                    ipu_write32(
                        column=0,
                        row=2,
                        address=0x340D4,
                        value=0x00000000,
                    )
                    # 0x340E0: Trace Event Group 1  (Which events to trace)
                    #          0xAABBCCDD    AA, BB, CC, DD <- four event slots
                    # NOTE(review): slots 0x22 and 0x21 are presumably the
                    # events raised by event1()/event0() in the kernel --
                    # confirm against the Tracing.md event table.
                    ipu_write32(
                        column=0,
                        row=2,
                        address=0x340E0,
                        value=0x00222100,
                    )
                    # 0x340E4: Trace Event Group 2  (Which events to trace)
                    #          0xAABBCCDD    AA, BB, CC, DD <- four event slots
                    ipu_write32(
                        column=0,
                        row=2,
                        address=0x340E4,
                        value=0x00000000,
                    )

                    # NOTE(review): register 0x3FF00 is not described in this
                    # file; presumably a stream-switch event port selection
                    # register -- confirm against Tracing.md.
                    ipu_write32(
                        column=0,
                        row=2,
                        address=0x3FF00,
                        value=0x00000121,
                    )

                    # Configure a buffer descriptor to write tracing information that has been routed into this shim tile
                    # out to host DDR memory
                    trace_bd_id = 13  # use BD 13 for writing trace output from compute tile to DDR host memory
                    # The trace region starts N_in_bytes into the buffer
                    # selected by ddr_id (presumably right after the output
                    # data -- confirm against the host code).
                    output_size = N_in_bytes
                    ipu_writebd_shimtile(
                        bd_id=trace_bd_id,
                        buffer_length=trace_size,
                        buffer_offset=output_size,
                        enable_packet=0,
                        out_of_order_id=0,
                        packet_id=0,
                        packet_type=0,
                        column=0,
                        column_num=1,
                        d0_size=0,
                        d0_stride=0,
                        d1_size=0,
                        d1_stride=0,
                        d2_stride=0,
                        ddr_id=1,
                        iteration_current=0,
                        iteration_size=0,
                        iteration_stride=0,
                        lock_acq_enable=0,
                        lock_acq_id=0,
                        lock_acq_val=0,
                        lock_rel_id=0,
                        lock_rel_val=0,
                        next_bd=0,
                        use_next_bd=0,
                        valid_bd=1,
                    )
                    # Set start BD to our shim bd_Id (13)
                    ipu_write32(column=0, row=0, address=0x1D20C, value=trace_bd_id)

                # Queue the output and input transfers, then wait on the sync.
                ipu_dma_memcpy_nd(
                    metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, C_sz_in_i32s]
                )
                ipu_dma_memcpy_nd(
                    metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s]
                )
                ipu_sync(column=0, row=0, direction=0, channel=0)

    # Emit the generated module as text on stdout (the Makefile redirects it
    # into build/aie.mlir).
    print(ctx.module)


my_relu()
Loading

0 comments on commit b0a2697

Please sign in to comment.