Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ReLU with tracing #1204

Merged
merged 5 commits into from
Apr 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 41 additions & 0 deletions aie_kernels/relu.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
//===- scale.cc -------------------------------------------------*- C++ -*-===//
//
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// Copyright (C) 2023, Advanced Micro Devices, Inc.
//
//===----------------------------------------------------------------------===//

#define __AIENGINE__ 2
#define NOCPP
#define __AIEARCH__ 20

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <type_traits>

#include <aie_api/aie.hpp>

void relu(bfloat16 *restrict a, bfloat16 *restrict c, const int TILE_SIZE) {
const int v_factor = 32;
v32bfloat16 zeroes = broadcast_zero_bfloat16();

event0();
for (size_t i = 0; i < TILE_SIZE; i += v_factor)
chess_prepare_for_pipelining chess_loop_range(32, 32) {
v32bfloat16 input = *(v32bfloat16 *)(a + i);
v32bfloat16 output = max(input, zeroes);
*(v32bfloat16 *)(c + i) = output;
}
event1();
return;
}

extern "C" {

void bf16_relu(bfloat16 *a_in, bfloat16 *c_out) { relu(a_in, c_out, 1024); }

} // extern "C"
68 changes: 68 additions & 0 deletions programming_examples/basic/relu/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2023 Advanced Micro Devices, Inc.

# parameters
# -DBOOST_ROOT: Path to Boost install
# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo
# -DXRT_LIB_DIR: Path to xrt_coreutil.lib
# -DTARGET_NAME: Target name to be built

# cmake needs this line
cmake_minimum_required(VERSION 3.1)

find_program(WSL NAMES powershell.exe)

if (NOT WSL)
set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
else()
set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
endif()

set(TARGET_NAME test CACHE STRING "Target to be built")

SET (ProjectName ${TARGET_NAME})
SET (currentTarget ${TARGET_NAME})

if ( WSL )
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR})
endif ()

project(${ProjectName})

# Find packages
find_package(Boost REQUIRED)

add_executable(${currentTarget}
test.cpp
)

target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1)

target_include_directories (${currentTarget} PUBLIC
${XRT_INC_DIR}
${Boost_INCLUDE_DIRS}
)

target_link_directories(${currentTarget} PUBLIC
${XRT_LIB_DIR}
${Boost_LIBRARY_DIRS}
)

if (NOT WSL)
target_link_libraries(${currentTarget} PUBLIC
xrt_coreutil
boost_program_options
boost_filesystem
)
else()
target_link_libraries(${currentTarget} PUBLIC
xrt_coreutil
)
endif()
54 changes: 54 additions & 0 deletions programming_examples/basic/relu/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
##===- Makefile -----------------------------------------------------------===##
#
# This file licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
##===----------------------------------------------------------------------===##

include ../makefile-common

targetname = testRelu

all: build/final.xclbin build/insts.txt

build/bf16_relu.o: ${REPO_ROOT}/aie_kernels/relu.cc
mkdir -p ${@D}
cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -I${REPO_ROOT}/my_install/mlir_aie/aie_runtime_lib/AIE2 -c $< -o ${@F}

build/aie.mlir: aie2.py
mkdir -p ${@D}
python3 $< > $@

build/final.xclbin: build/aie.mlir build/bf16_relu.o
mkdir -p ${@D}
cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
--aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%)

${targetname}.exe: test.cpp
rm -rf _build
mkdir -p _build
cd _build && ${powershell} cmake -E env CXXFLAGS="-std=c++23" cmake .. -D CMAKE_C_COMPILER=gcc-13 -D CMAKE_CXX_COMPILER=g++-13 -DTARGET_NAME=${targetname}
cd _build && ${powershell} cmake --build . --config Release
ifeq "${powershell}" "powershell.exe"
cp _build/${targetname}.exe $@
else
cp _build/${targetname} $@
endif

run: ${targetname}.exe build/final.xclbin build/insts.txt
${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE

run_g: ${targetname}.exe build/final.xclbin build/insts.txt
${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE --trace_sz 65536


trace:
../../utils/parse_eventIR.py --filename trace.txt --mlir build/aie.mlir --colshift 1 > parse_eventIR_vs.json

clean_trace:
rm -rf tmpTrace trace.txt

clean: clean_trace
rm -rf build _build ${targetname}.exe

209 changes: 209 additions & 0 deletions programming_examples/basic/relu/aie2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
#
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2023 AMD Inc.

import sys

from aie.dialects.aie import *
from aie.dialects.aiex import *
from aie.dialects.scf import *
from aie.extras.context import mlir_mod_ctx


def my_relu():

word_size_in = 2
N = 65536
N_in_bytes = N * word_size_in

A_sz_in_i32s = N_in_bytes // 4
C_sz_in_i32s = N_in_bytes // 4

enable_tracing = True
trace_size = 65536

# Tile sizes
n = 1024
N_div_n = N // n

n_cores = 2
tiles = N_div_n // n_cores
buffer_depth = 2

with mlir_mod_ctx() as ctx:

@device(AIEDevice.ipu)
def device_body():
memRef_ty = T.memref(n, T.bf16())

# Type used in the tile memory
memRef_A_ty = T.memref(n, T.bf16())
memRef_C_ty = T.memref(n, T.bf16())

# Type used in the memory tile which aggregates across the 4 cores
memRef_A_MT_ty = T.memref(n * n_cores, T.bf16())
memRef_C_MT_ty = T.memref(n * n_cores, T.bf16())

# AIE Core Function declarations

bf16_relu = external_func("bf16_relu", inputs=[memRef_ty, memRef_ty])

# Tile declarations
ShimTile = tile(0, 0)

MemTile = tile(0, 1)
cores = [tile(0, 2 + i) for i in range(n_cores)]

inA_fifo_names = [f"memA{i}" for i in range(n_cores)]
outC_fifo_names = [f"memC{i}" for i in range(n_cores)]

inA_fifos = {}
outC_fifos = {}

# AIE-array data movement with object fifos
# Input A
inA = object_fifo("inA", ShimTile, MemTile, buffer_depth, memRef_A_MT_ty)
for i in range(n_cores):
inA_fifos[inA_fifo_names[i]] = object_fifo(
inA_fifo_names[i], MemTile, cores[i], buffer_depth, memRef_A_ty
)
object_fifo_link(inA, inA_fifo_names)

# Output C
for i in range(n_cores):
outC_fifos[outC_fifo_names[i]] = object_fifo(
outC_fifo_names[i], cores[i], MemTile, buffer_depth, memRef_C_ty
)
outC = object_fifo("outC", MemTile, ShimTile, buffer_depth, memRef_C_MT_ty)
object_fifo_link(outC_fifo_names[0:n_cores], outC)

# Set up a circuit-switched flow from core to shim for tracing information
if enable_tracing:
flow(cores[0], WireBundle.Trace, 0, ShimTile, WireBundle.DMA, 1)

# Set up compute tiles
for i in range(n_cores):
# Compute tile i
@core(cores[i], "bf16_relu.o")
def core_body():
for _ in for_(0xFFFFFFFF):
for _ in for_(tiles):
elem_out = outC_fifos[outC_fifo_names[i]].acquire(
ObjectFifoPort.Produce, 1
)
elem_in_a = inA_fifos[inA_fifo_names[i]].acquire(
ObjectFifoPort.Consume, 1
)

call(bf16_relu, [elem_in_a, elem_out])

inA_fifos[inA_fifo_names[i]].release(
ObjectFifoPort.Consume, 1
)
outC_fifos[outC_fifo_names[i]].release(
ObjectFifoPort.Produce, 1
)
yield_([])
yield_([])

# To/from AIE-array data movement
tensor_ty = T.memref(N, T.i32())

@FuncOp.from_py_func(tensor_ty, tensor_ty)
def sequence(A, C):

# Configure tracing, see https://github.com/Xilinx/mlir-aie/blob/resnet/docs/Tracing.md
if enable_tracing:
# 0x340D0: Trace Control 0
# 0xAABB---C
# AA <- Event to stop trace capture
# BB <- Event to start trace capture
# C <- Trace mode, 00=event=time, 01=event-PC, 10=execution
# Configure so that "Event 1" (always true) causes tracing to start
ipu_write32(
column=0,
row=2,
address=0x340D0,
value=0x00010000,
)
# 0x340D4: Trace Control 1
ipu_write32(
column=0,
row=2,
address=0x340D4,
value=0x00000000,
)
# 0x340E0: Trace Event Group 1 (Which events to trace)
# 0xAABBCCDD AA, BB, CC, DD <- four event slots
ipu_write32(
column=0,
row=2,
address=0x340E0,
value=0x00222100,
)
# 0x340E4: Trace Event Group 2 (Which events to trace)
# 0xAABBCCDD AA, BB, CC, DD <- four event slots
ipu_write32(
column=0,
row=2,
address=0x340E4,
value=0x00000000,
)

ipu_write32(
column=0,
row=2,
address=0x3FF00,
value=0x00000121,
)

# Configure a buffer descriptor to write tracing information that has been routed into this shim tile
# out to host DDR memory
trace_bd_id = 13 # use BD 13 for writing trace output from compute tile to DDR host memory
output_size = N_in_bytes
ipu_writebd_shimtile(
bd_id=trace_bd_id,
buffer_length=trace_size,
buffer_offset=output_size,
enable_packet=0,
out_of_order_id=0,
packet_id=0,
packet_type=0,
column=0,
column_num=1,
d0_size=0,
d0_stride=0,
d1_size=0,
d1_stride=0,
d2_stride=0,
ddr_id=1,
iteration_current=0,
iteration_size=0,
iteration_stride=0,
lock_acq_enable=0,
lock_acq_id=0,
lock_acq_val=0,
lock_rel_id=0,
lock_rel_val=0,
next_bd=0,
use_next_bd=0,
valid_bd=1,
)
# Set start BD to our shim bd_Id (13)
ipu_write32(column=0, row=0, address=0x1D20C, value=trace_bd_id)

ipu_dma_memcpy_nd(
metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, C_sz_in_i32s]
)
ipu_dma_memcpy_nd(
metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s]
)
ipu_sync(column=0, row=0, direction=0, channel=0)

print(ctx.module)


my_relu()
Loading
Loading