Merge branch 'asplos' into passthroughKernel

Xilinx · Apr 10, 2024 · ab84b08 · ab84b08
2 parents b17f178 + 121858c
commit ab84b08
Show file tree

Hide file tree

Showing 26 changed files with 2,410 additions and 35 deletions.
diff --git a/...ramming_examples/basic/eltwise_add/add.cc → aie_kernels/aie2/add.cc b/...ramming_examples/basic/eltwise_add/add.cc → aie_kernels/aie2/add.cc
diff --git a/aie_kernels/aie2/bf16_exp.cc b/aie_kernels/aie2/bf16_exp.cc
@@ -0,0 +1,23 @@
+#include <lut_based_ops.h>
+
+// Element-wise exponential over N bfloat16 values, 16 lanes per iteration.
+// Each vector loaded from `in` is passed through getExpBf16(), converted back
+// to bfloat16 with to_v16bfloat16() and stored at the same offset in `out`.
+//
+//   N   - number of elements to process (compile-time constant)
+//   in  - source buffer holding at least N elements
+//   out - destination buffer holding at least N elements
+template <const int N>
+void exp_bf16_func(bfloat16 *restrict in, bfloat16 *restrict out) {
+  constexpr int vec_size = 16;
+  // Every iteration loads and stores one full vector, so a size that is not a
+  // multiple of the vector width would run past the end of both buffers.
+  static_assert(N % vec_size == 0,
+                "N must be a multiple of the 16-lane vector width");
+
+  // NOTE(review): the loop-range annotation is fixed at 64 iterations, which
+  // matches N == 1024 only - confirm before instantiating with another N.
+  for (int i = 0; i < N; i += vec_size)
+    chess_prepare_for_pipelining chess_loop_range(64, 64) {
+      v16bfloat16 vec_in = *(v16bfloat16 *)(in + i);
+      v16accfloat acc_exp = getExpBf16(vec_in);
+      v16bfloat16 bf16_exp = to_v16bfloat16(acc_exp);
+      *(v16bfloat16 *)(out + i) = bf16_exp;
+    }
+  return;
+}
+
+extern "C" {
+
+// C-linkage entry point: runs exp_bf16_func over a fixed 1024-element buffer.
+void exp_bf16_1024(bfloat16 *a_in, bfloat16 *c_out) {
+  constexpr int kNumElements = 1024;
+  exp_bf16_func<kNumElements>(a_in, c_out);
+}
+
+} // extern "C"
diff --git a/aie_kernels/relu.cc b/aie_kernels/relu.cc
@@ -0,0 +1,41 @@
+//===- relu.cc --------------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#define __AIENGINE__ 2
+#define NOCPP
+#define __AIEARCH__ 20
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <type_traits>
+
+#include <aie_api/aie.hpp>
+
+// Element-wise ReLU on bfloat16 data: each 32-lane vector read from `a` is
+// clamped from below with max(input, 0) and written to the same offset in `c`.
+//
+//   a         - source buffer
+//   c         - destination buffer
+//   TILE_SIZE - number of elements to process
+//
+// NOTE(review): the loop-range annotation declares exactly 32 iterations,
+// i.e. TILE_SIZE == 1024 - confirm before calling with another size.
+void relu(bfloat16 *restrict a, bfloat16 *restrict c, const int TILE_SIZE) {
+  const int v_factor = 32;
+  v32bfloat16 zeroes = broadcast_zero_bfloat16();
+
+  // event0()/event1() bracket the loop - presumably trace markers; confirm.
+  event0();
+  // Signed index: comparing an unsigned counter against the signed TILE_SIZE
+  // would turn a negative size into a huge bound instead of an empty loop.
+  for (int i = 0; i < TILE_SIZE; i += v_factor)
+    chess_prepare_for_pipelining chess_loop_range(32, 32) {
+      v32bfloat16 input = *(v32bfloat16 *)(a + i);
+      v32bfloat16 output = max(input, zeroes);
+      *(v32bfloat16 *)(c + i) = output;
+    }
+  event1();
+  return;
+}
+
+extern "C" {
+
+// C-linkage entry point: applies relu() to a fixed 1024-element buffer.
+void bf16_relu(bfloat16 *a_in, bfloat16 *c_out) {
+  const int kTileSize = 1024;
+  relu(a_in, c_out, kTileSize);
+}
+
+} // extern "C"
diff --git a/programming_examples/basic/eltwise_exp/CMakeLists.txt b/programming_examples/basic/eltwise_exp/CMakeLists.txt
@@ -0,0 +1,69 @@
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2023 Advanced Micro Devices, Inc.
+
+# parameters
+# -DBOOST_ROOT: Path to Boost install
+# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo
+# -DXRT_LIB_DIR: Path to xrt_coreutil.lib
+# -DTARGET_NAME: Target name to be built
+
+# cmake needs this line. target_link_directories(), used further down, was
+# introduced in CMake 3.13, so that is the oldest version able to configure
+# this file.
+cmake_minimum_required(VERSION 3.13)
+
+# Probe for powershell.exe; when it is found the Windows-style default paths
+# are used (presumably a WSL/Windows host - confirm), otherwise the Linux ones.
+find_program(WSL NAMES powershell.exe)
+
+if(WSL)
+    set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
+    set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
+    set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+else()
+    set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
+    set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
+    set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
+endif()
+
+set(TARGET_NAME test CACHE STRING "Target to be built")
+
+# The project and the executable both take the configured target name.
+set(ProjectName ${TARGET_NAME})
+set(currentTarget ${TARGET_NAME})
+
+# With powershell.exe present, Release binaries go straight into the build root.
+if(WSL)
+    set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR})
+endif()
+
+project(${ProjectName})
+
+# Find packages
+find_package(Boost REQUIRED)
+
+add_executable(${currentTarget} test.cpp)
+
+target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1)
+
+target_include_directories(${currentTarget}
+    PUBLIC
+        ${XRT_INC_DIR}
+        ${Boost_INCLUDE_DIRS}
+        ../../../programming_examples/utils
+)
+
+target_link_directories(${currentTarget}
+    PUBLIC
+        ${XRT_LIB_DIR}
+        ${Boost_LIBRARY_DIRS}
+)
+
+# xrt_coreutil is linked in both environments; the Boost libraries are only
+# linked when powershell.exe was not found.
+target_link_libraries(${currentTarget} PUBLIC xrt_coreutil)
+if(NOT WSL)
+    target_link_libraries(${currentTarget}
+        PUBLIC
+            boost_program_options
+            boost_filesystem
+    )
+endif()
diff --git a/programming_examples/basic/eltwise_exp/Makefile b/programming_examples/basic/eltwise_exp/Makefile
@@ -0,0 +1,55 @@
+##===- Makefile -----------------------------------------------------------===##
+# 
+# This file licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+# 
+##===----------------------------------------------------------------------===##
+
+include ../../../programming_examples/basic/makefile-common
+
+# Base name of the host executable built from test.cpp.
+targetname = eltwise_exp
+
+# Top-level goal of this example: the device binary.
+all: build/final.xclbin
+
+# Compile the lookup-table helper source taken from the installed AIE2 runtime
+# library directory.
+build/lut_based_ops.o:
+	mkdir -p $(@D)
+	cd $(@D) && xchesscc_wrapper $(CHESSCCWRAP2_FLAGS) -I. -c $(REPO_ROOT)/my_install/mlir_aie/aie_runtime_lib/AIE2/lut_based_ops.cpp -o $(@F)
+
+# Compile the bf16 exp kernel; the AIE2 runtime library directory is added to
+# the include path.
+build/exp.o:
+	mkdir -p $(@D)
+	cd $(@D) && xchesscc_wrapper $(CHESSCCWRAP2_FLAGS) -I. -I$(REPO_ROOT)/my_install/mlir_aie/aie_runtime_lib/AIE2 -c $(REPO_ROOT)/aie_kernels/aie2/bf16_exp.cc -o $(@F)
+
+# Bundle both objects into the archive the cores link against.
+build/kernels.a: build/exp.o build/lut_based_ops.o
+	ar rvs $@ $+
+
+
+
+# Generate the MLIR design by running the Python description and capturing
+# what it prints on stdout.
+build/aie.mlir: aie2.py
+	mkdir -p $(@D)
+	python3 $< > $@
+
+# Build the xclbin and the instruction stream (insts.txt) from the generated
+# MLIR. build/kernels.a is a prerequisite because aie2.py links every core
+# against "kernels.a"; without it the archive is never built. aie.mlir stays
+# first so that ${<F} still names the MLIR input.
+build/final.xclbin: build/aie.mlir build/kernels.a
+	mkdir -p ${@D}
+	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \
+		--xclbin-name=${@F} --ipu-insts-name=insts.txt ${<F}
+
+# Configure and build the host test program in a fresh _build directory, then
+# copy the resulting binary next to this Makefile as $(targetname).exe.
+$(targetname).exe: test.cpp
+	rm -rf _build
+	mkdir -p _build
+	cd _build && $(powershell) cmake -E env CXXFLAGS="-std=c++23 -ggdb" cmake .. -D CMAKE_C_COMPILER=gcc-13 -D CMAKE_CXX_COMPILER=g++-13 -DTARGET_NAME=$(targetname) -Dsubdir=$(subdir)
+	cd _build && $(powershell) cmake --build . --config Release
+ifeq "$(powershell)" "powershell.exe"
+	cp _build/$(targetname).exe $@
+else
+	cp _build/$(targetname) $@
+endif
+
+# These targets name actions, not files; declare them phony so a stray file
+# called "run", "run_py" or "clean" cannot make them appear up to date.
+.PHONY: run run_py clean
+
+# Run the compiled host program against the generated xclbin and instructions.
+run: ${targetname}.exe build/final.xclbin build/insts.txt
+	${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE
+
+# Same as "run", but drives the design from the Python test script.
+run_py: build/final.xclbin build/insts.txt
+	${powershell} python3 test.py -x build/final.xclbin -i build/insts.txt -k MLIR_AIE
+
+# Remove every generated artifact.
+clean:
+	rm -rf build _build ${targetname}.exe
diff --git a/programming_examples/basic/eltwise_exp/README.md b/programming_examples/basic/eltwise_exp/README.md
@@ -0,0 +1,13 @@
+<!---//===- README.md --------------------------*- Markdown -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2022, Advanced Micro Devices, Inc.
+// 
+//===----------------------------------------------------------------------===//-->
+
+# <ins>Eltwise Exp</ins>
+
+A simple element-wise exponential function, using the lookup-table capabilities of the AI Engine.
diff --git a/programming_examples/basic/eltwise_exp/aie2.py b/programming_examples/basic/eltwise_exp/aie2.py
@@ -0,0 +1,129 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2023 AMD Inc.
+
+from aie.dialects.aie import *  # primary mlir-aie dialect definitions
+from aie.extras.context import mlir_mod_ctx  # mlir ctx wrapper
+
+from aie.dialects.aiex import *  # extended mlir-aie dialect definitions
+from aie.dialects.scf import *  # scf (strcutred control flow) dialect
+from aie.extras.dialects.ext import memref, arith  # memref and arithmatic dialects
+
+
+# AI Engine structural design function
+def my_eltwise_exp():
+
+    word_size_in = 2
+    N = 65536
+    N_in_bytes = N * word_size_in
+
+    A_sz_in_i32s = N_in_bytes // 4
+    C_sz_in_i32s = N_in_bytes // 4
+
+    # Tile sizes
+    n = 1024
+    N_div_n = N // n
+
+    n_cores = 4
+    tiles = N_div_n // n_cores
+    buffer_depth = 2
+
+    # ctx wrapper - to convert python to mlir
+    with mlir_mod_ctx() as ctx:
+
+        # Dvice declaration - aie2 device IPU (aka Ryzen AI)
+        @device(AIEDevice.ipu)
+        def device_body():
+
+            memRef_ty = T.memref(n, T.bf16())
+
+            # Type used in the tile memory
+            memRef_A_ty = T.memref(n, T.bf16())
+            memRef_C_ty = T.memref(n, T.bf16())
+
+            # Type used in the memory tile which aggregates across the 4 cores
+            memRef_A_MT_ty = T.memref(n * n_cores, T.bf16())
+            memRef_C_MT_ty = T.memref(n * n_cores, T.bf16())
+
+            # AIE Core Function declarations
+
+            exp_bf16_1024 = external_func(
+                "exp_bf16_1024", inputs=[memRef_ty, memRef_ty]
+            )
+
+            # Tile declarations
+            ShimTile = tile(0, 0)
+
+            MemTile = tile(0, 1)
+            cores = [tile(0, 2 + i) for i in range(n_cores)]
+
+            inA_fifo_names = [f"memA{i}" for i in range(n_cores)]
+            outC_fifo_names = [f"memC{i}" for i in range(n_cores)]
+
+            inA_fifos = {}
+            outC_fifos = {}
+
+            # AIE-array data movement with object fifos
+            # Input A
+            inA = object_fifo("inA", ShimTile, MemTile, buffer_depth, memRef_A_MT_ty)
+            for i in range(n_cores):
+                inA_fifos[inA_fifo_names[i]] = object_fifo(
+                    inA_fifo_names[i], MemTile, cores[i], buffer_depth, memRef_A_ty
+                )
+            object_fifo_link(inA, inA_fifo_names)
+
+            # Output C
+            for i in range(n_cores):
+                outC_fifos[outC_fifo_names[i]] = object_fifo(
+                    outC_fifo_names[i], cores[i], MemTile, buffer_depth, memRef_C_ty
+                )
+            outC = object_fifo("outC", MemTile, ShimTile, buffer_depth, memRef_C_MT_ty)
+            object_fifo_link(outC_fifo_names[0:n_cores], outC)
+
+            # Compute tile bodies
+            for i in range(n_cores):
+                # Compute tile i
+                @core(cores[i], "kernels.a")
+                def core_body():
+                    for _ in for_(0xFFFFFFFF):
+                        for _ in for_(tiles):
+                            elem_out = outC_fifos[outC_fifo_names[i]].acquire(
+                                ObjectFifoPort.Produce, 1
+                            )
+                            elem_in_a = inA_fifos[inA_fifo_names[i]].acquire(
+                                ObjectFifoPort.Consume, 1
+                            )
+
+                            call(exp_bf16_1024, [elem_in_a, elem_out])
+
+                            inA_fifos[inA_fifo_names[i]].release(
+                                ObjectFifoPort.Consume, 1
+                            )
+                            outC_fifos[outC_fifo_names[i]].release(
+                                ObjectFifoPort.Produce, 1
+                            )
+                            yield_([])
+                        yield_([])
+
+            # To/from AIE-array data movement
+            tensor_ty = T.memref(N, T.i32())
+
+            @FuncOp.from_py_func(tensor_ty, tensor_ty)
+            def sequence(A, C):
+                ipu_dma_memcpy_nd(
+                    metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, C_sz_in_i32s]
+                )
+                ipu_dma_memcpy_nd(
+                    metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s]
+                )
+                ipu_sync(column=0, row=0, direction=0, channel=0)
+
+    # Print the mlir conversion
+    print(ctx.module)
+
+
+# Call design function to generate mlir code to stdout
+my_eltwise_exp()