Skip to content

Commit

Permalink
Merge branch 'asplos' into passthroughKernel
Browse files Browse the repository at this point in the history
  • Loading branch information
denolf authored Apr 10, 2024
2 parents b17f178 + 121858c commit ab84b08
Show file tree
Hide file tree
Showing 26 changed files with 2,410 additions and 35 deletions.
File renamed without changes.
23 changes: 23 additions & 0 deletions aie_kernels/aie2/bf16_exp.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#include <lut_based_ops.h>

// Element-wise exponential over N bfloat16 values.
//
// Reads `in` 16 elements at a time, evaluates getExpBf16() on each vector
// (declared in lut_based_ops.h), converts the float accumulator result back to
// bfloat16 and writes it to the same offset in `out`.
//
//   N   - number of elements to process; must be a multiple of 16 because the
//         loop always loads/stores whole 16-lane vectors.
//   in  - input buffer of at least N bfloat16 values.
//   out - output buffer of at least N bfloat16 values (must not alias `in`,
//         both pointers are declared restrict).
template <const int N>
void exp_bf16_func(bfloat16 *restrict in, bfloat16 *restrict out) {

  constexpr int vec_size = 16;

  // A partial trailing vector would read and write past the end of the
  // buffers, so reject such instantiations at compile time.
  static_assert(N % vec_size == 0,
                "exp_bf16_func: N must be a multiple of the vector size (16)");

  // NOTE(review): chess_loop_range(64, 64) presumably promises the compiler a
  // trip count of exactly 64, which matches N == 1024 only - confirm before
  // instantiating this template with a different N.
  for (int i = 0; i < N; i += vec_size)
    chess_prepare_for_pipelining chess_loop_range(64, 64) {
      v16bfloat16 vec_in = *(v16bfloat16 *)(in + i);
      v16accfloat acc_exp = getExpBf16(vec_in);
      v16bfloat16 bf16_exp = to_v16bfloat16(acc_exp);
      *(v16bfloat16 *)(out + i) = bf16_exp;
    }
  return;
}

extern "C" {

// C-linkage entry point: applies exp_bf16_func to a fixed 1024-element tile.
// The unmangled name lets the design reference the kernel as "exp_bf16_1024".
void exp_bf16_1024(bfloat16 *a_in, bfloat16 *c_out) {
  constexpr int kTileElements = 1024;
  exp_bf16_func<kTileElements>(a_in, c_out);
}

} // extern "C"
41 changes: 41 additions & 0 deletions aie_kernels/relu.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
//===- relu.cc --------------------------------------------------*- C++ -*-===//
//
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// Copyright (C) 2023, Advanced Micro Devices, Inc.
//
//===----------------------------------------------------------------------===//

#define __AIENGINE__ 2
#define NOCPP
#define __AIEARCH__ 20

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <type_traits>

#include <aie_api/aie.hpp>

// Element-wise ReLU over a tile of bfloat16 values.
//
// Processes the tile 32 elements at a time: each input vector is compared
// lane-wise against an all-zero vector and the larger value is stored.
//
//   a         - input buffer of at least TILE_SIZE bfloat16 values.
//   c         - output buffer of at least TILE_SIZE bfloat16 values (must not
//               alias `a`, both pointers are declared restrict).
//   TILE_SIZE - number of elements to process; whole 32-lane vectors are
//               always loaded/stored, so it is expected to be a multiple
//               of 32.
//
// event0()/event1() bracket the loop so the compute section can be traced.
void relu(bfloat16 *restrict a, bfloat16 *restrict c, const int TILE_SIZE) {
  const int v_factor = 32;
  v32bfloat16 zeroes = broadcast_zero_bfloat16();

  event0();
  // The index is a signed int to match TILE_SIZE: comparing a size_t index
  // against a signed bound would convert a negative TILE_SIZE into a huge
  // unsigned value instead of yielding zero iterations.
  // NOTE(review): chess_loop_range(32, 32) presumably promises exactly 32
  // iterations, i.e. TILE_SIZE == 1024 - confirm before calling with another
  // size.
  for (int i = 0; i < TILE_SIZE; i += v_factor)
    chess_prepare_for_pipelining chess_loop_range(32, 32) {
      v32bfloat16 input = *(v32bfloat16 *)(a + i);
      v32bfloat16 output = max(input, zeroes);
      *(v32bfloat16 *)(c + i) = output;
    }
  event1();
  return;
}

extern "C" {

// C-linkage entry point: runs relu() on a fixed 1024-element bfloat16 tile.
void bf16_relu(bfloat16 *a_in, bfloat16 *c_out) {
  const int tile_elements = 1024;
  relu(a_in, c_out, tile_elements);
}

} // extern "C"
69 changes: 69 additions & 0 deletions programming_examples/basic/eltwise_exp/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2023 Advanced Micro Devices, Inc.

# parameters
# -DBOOST_ROOT: Path to Boost install
# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo
# -DXRT_LIB_DIR: Path to xrt_coreutil.lib
# -DTARGET_NAME: Target name to be built

# Must be the first command. 3.13 is the real floor for this script because
# target_link_directories() (used below) was introduced in CMake 3.13.
cmake_minimum_required(VERSION 3.13)

# powershell.exe being on the PATH is used as the marker for a Windows/WSL
# host; the result selects the default dependency locations below.
find_program(WSL NAMES powershell.exe)

# Default dependency locations. All of them are cache entries, so they can be
# overridden on the command line with -D<NAME>=<path>.
if(NOT WSL)
  set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
  set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
  set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
else()
  set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
  set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
  set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
endif()

set(TARGET_NAME test CACHE STRING "Target to be built")

# The project and its single executable share the user-selected target name.
set(ProjectName ${TARGET_NAME})
set(currentTarget ${TARGET_NAME})

# On the Windows/WSL path, place Release binaries directly in the build root
# so the Makefile can copy them from a fixed location.
if(WSL)
  set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR})
endif()

project(${ProjectName})

# Find packages
find_package(Boost REQUIRED)

# Host-side test application.
add_executable(${currentTarget}
  test.cpp
)

target_compile_definitions(${currentTarget} PRIVATE DISABLE_ABI_CHECK=1)

target_include_directories(${currentTarget} PRIVATE
  ${XRT_INC_DIR}
  ${Boost_INCLUDE_DIRS}
  ${CMAKE_CURRENT_SOURCE_DIR}/../../../programming_examples/utils
)

target_link_directories(${currentTarget} PRIVATE
  ${XRT_LIB_DIR}
  ${Boost_LIBRARY_DIRS}
)

# The Boost libraries are only linked by name on the non-WSL path; the
# Windows/WSL build links XRT alone.
if(NOT WSL)
  target_link_libraries(${currentTarget} PRIVATE
    xrt_coreutil
    boost_program_options
    boost_filesystem
  )
else()
  target_link_libraries(${currentTarget} PRIVATE
    xrt_coreutil
  )
endif()
55 changes: 55 additions & 0 deletions programming_examples/basic/eltwise_exp/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
##===- Makefile -----------------------------------------------------------===##
#
# This file licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
##===----------------------------------------------------------------------===##

# Common definitions shared by the basic examples (file not shown here).
include ../../../programming_examples/basic/makefile-common

# Default goal: the packaged design.
all: build/final.xclbin

targetname = eltwise_exp

# Lookup-table support code taken from the installed AIE2 runtime library.
build/lut_based_ops.o:
	mkdir -p $(@D)
	cd $(@D) && xchesscc_wrapper $(CHESSCCWRAP2_FLAGS) -I. -c $(REPO_ROOT)/my_install/mlir_aie/aie_runtime_lib/AIE2/lut_based_ops.cpp -o $(@F)

# The bf16 exp kernel; needs the runtime library directory for lut_based_ops.h.
build/exp.o:
	mkdir -p $(@D)
	cd $(@D) && xchesscc_wrapper $(CHESSCCWRAP2_FLAGS) -I. -I$(REPO_ROOT)/my_install/mlir_aie/aie_runtime_lib/AIE2 -c $(REPO_ROOT)/aie_kernels/aie2/bf16_exp.cc -o $(@F)

# Archive both objects into the single library the design links against.
build/kernels.a: build/exp.o build/lut_based_ops.o
	ar rvs $@ $+


# Generate the MLIR description of the design from the Python script.
build/aie.mlir: aie2.py
	mkdir -p ${@D}
	python3 $< > $@

# Compile the design into an xclbin plus the instruction stream (insts.txt).
# aie2.py links every core against "kernels.a", and aiecc.py runs inside
# build/, so the archive must already exist there: it is listed as a
# prerequisite. ${<F} still names aie.mlir because that stays the FIRST
# prerequisite.
build/final.xclbin: build/aie.mlir build/kernels.a
	mkdir -p ${@D}
	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \
		--xclbin-name=${@F} --ipu-insts-name=insts.txt ${<F}

# Host test application: configured and built with CMake in a scratch
# directory (_build), then copied next to the Makefile as <targetname>.exe.
# Alternative configure step without the compiler/standard overrides:
#   cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname}
$(targetname).exe: test.cpp
	rm -rf _build
	mkdir -p _build
	cd _build && $(powershell) cmake -E env CXXFLAGS="-std=c++23 -ggdb" cmake .. -D CMAKE_C_COMPILER=gcc-13 -D CMAKE_CXX_COMPILER=g++-13 -DTARGET_NAME=$(targetname) -Dsubdir=$(subdir)
	cd _build && $(powershell) cmake --build . --config Release
# The produced binary only carries an .exe suffix when built through powershell.
ifeq "$(powershell)" "powershell.exe"
	cp _build/$(targetname).exe $@
else
	cp _build/$(targetname) $@
endif

# These goals name actions, not files. Declaring them phony keeps them working
# even if a file or directory called "all", "run", "run_py" or "clean" exists.
.PHONY: all run run_py clean

# Run the C++ host test against the built design.
run: ${targetname}.exe build/final.xclbin build/insts.txt
	${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE

# Run the Python host test against the built design.
run_py: build/final.xclbin build/insts.txt
	${powershell} python3 test.py -x build/final.xclbin -i build/insts.txt -k MLIR_AIE

# Remove every generated artifact.
clean:
	rm -rf build _build ${targetname}.exe
13 changes: 13 additions & 0 deletions programming_examples/basic/eltwise_exp/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
<!---//===- README.md --------------------------*- Markdown -*-===//
//
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// Copyright (C) 2022, Advanced Micro Devices, Inc.
//
//===----------------------------------------------------------------------===//-->

# <ins>Eltwise Exp</ins>

A simple element-wise exponential function, using the lookup-table capabilities of the AI Engine.
129 changes: 129 additions & 0 deletions programming_examples/basic/eltwise_exp/aie2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
#
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2023 AMD Inc.

from aie.dialects.aie import * # primary mlir-aie dialect definitions
from aie.extras.context import mlir_mod_ctx # mlir ctx wrapper

from aie.dialects.aiex import * # extended mlir-aie dialect definitions
from aie.dialects.scf import * # scf (structured control flow) dialect
from aie.extras.dialects.ext import memref, arith # memref and arithmetic dialects


# AI Engine structural design function
def my_eltwise_exp():
    """Build the element-wise bf16 exp design and print its MLIR to stdout.

    The design processes N bf16 values. The shim tile streams data into the
    memory tile, which distributes n-element tiles to n_cores compute tiles;
    each compute tile calls the external kernel ``exp_bf16_1024`` (linked from
    ``kernels.a``) on one tile at a time, and the results are joined in the
    memory tile and streamed back out through the shim tile.
    """

    word_size_in = 2  # bytes per bf16 element
    N = 65536
    N_in_bytes = N * word_size_in

    # The runtime sequence addresses the host buffers in 32-bit words.
    A_sz_in_i32s = N_in_bytes // 4
    C_sz_in_i32s = N_in_bytes // 4

    # Tile sizes
    n = 1024
    N_div_n = N // n

    n_cores = 4
    tiles = N_div_n // n_cores  # tiles processed by each core per run
    buffer_depth = 2

    # ctx wrapper - to convert python to mlir
    with mlir_mod_ctx() as ctx:

        # Device declaration - aie2 device IPU (aka Ryzen AI)
        @device(AIEDevice.ipu)
        def device_body():

            memRef_ty = T.memref(n, T.bf16())

            # Type used in the tile memory
            memRef_A_ty = T.memref(n, T.bf16())
            memRef_C_ty = T.memref(n, T.bf16())

            # Type used in the memory tile which aggregates across the 4 cores
            memRef_A_MT_ty = T.memref(n * n_cores, T.bf16())
            memRef_C_MT_ty = T.memref(n * n_cores, T.bf16())

            # AIE Core Function declarations

            exp_bf16_1024 = external_func(
                "exp_bf16_1024", inputs=[memRef_ty, memRef_ty]
            )

            # Tile declarations
            ShimTile = tile(0, 0)

            MemTile = tile(0, 1)
            cores = [tile(0, 2 + i) for i in range(n_cores)]

            inA_fifo_names = [f"memA{i}" for i in range(n_cores)]
            outC_fifo_names = [f"memC{i}" for i in range(n_cores)]

            inA_fifos = {}
            outC_fifos = {}

            # AIE-array data movement with object fifos
            # Input A: shim -> memory tile, then one fifo per core
            inA = object_fifo("inA", ShimTile, MemTile, buffer_depth, memRef_A_MT_ty)
            for i in range(n_cores):
                inA_fifos[inA_fifo_names[i]] = object_fifo(
                    inA_fifo_names[i], MemTile, cores[i], buffer_depth, memRef_A_ty
                )
            object_fifo_link(inA, inA_fifo_names)

            # Output C: one fifo per core -> memory tile, then memory tile -> shim
            for i in range(n_cores):
                outC_fifos[outC_fifo_names[i]] = object_fifo(
                    outC_fifo_names[i], cores[i], MemTile, buffer_depth, memRef_C_ty
                )
            outC = object_fifo("outC", MemTile, ShimTile, buffer_depth, memRef_C_MT_ty)
            object_fifo_link(outC_fifo_names[0:n_cores], outC)

            # Compute tile bodies
            for i in range(n_cores):
                # Compute tile i
                @core(cores[i], "kernels.a")
                def core_body():
                    for _ in for_(0xFFFFFFFF):
                        for _ in for_(tiles):
                            elem_out = outC_fifos[outC_fifo_names[i]].acquire(
                                ObjectFifoPort.Produce, 1
                            )
                            elem_in_a = inA_fifos[inA_fifo_names[i]].acquire(
                                ObjectFifoPort.Consume, 1
                            )

                            call(exp_bf16_1024, [elem_in_a, elem_out])

                            inA_fifos[inA_fifo_names[i]].release(
                                ObjectFifoPort.Consume, 1
                            )
                            outC_fifos[outC_fifo_names[i]].release(
                                ObjectFifoPort.Produce, 1
                            )
                            yield_([])
                        yield_([])

            # To/from AIE-array data movement
            # The host buffers hold N bf16 values, i.e. N_in_bytes // 4 32-bit
            # words, so the i32 memref length is the word count (not N), which
            # is also the size used by the DMA transfers below.
            tensor_A_ty = T.memref(A_sz_in_i32s, T.i32())
            tensor_C_ty = T.memref(C_sz_in_i32s, T.i32())

            @FuncOp.from_py_func(tensor_A_ty, tensor_C_ty)
            def sequence(A, C):
                ipu_dma_memcpy_nd(
                    metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, C_sz_in_i32s]
                )
                ipu_dma_memcpy_nd(
                    metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s]
                )
                ipu_sync(column=0, row=0, direction=0, channel=0)

    # Print the mlir conversion
    print(ctx.module)


# Call design function to generate mlir code to stdout
my_eltwise_exp()
Loading

0 comments on commit ab84b08

Please sign in to comment.