[ASPLOS][WIP] Passthrough kernel in basic examples (Xilinx#1216)

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
fifield · Apr 12, 2024 · 0300869 · 0300869
1 parent c51c7aa
commit 0300869
Show file tree

Hide file tree

Showing 16 changed files with 444 additions and 26 deletions.
diff --git a/...ples/vision/vision_kernels/passThrough.cc → aie_kernels/aie_generic/passThrough.cc b/...ples/vision/vision_kernels/passThrough.cc → aie_kernels/aie_generic/passThrough.cc
@@ -15,30 +15,21 @@
 #include <stdio.h>
 #include <stdlib.h>
 
-#define REL_WRITE 0
-#define REL_READ 1
-
 #include <aie_api/aie.hpp>
 
 template <typename T, int N>
 __attribute__((noinline)) void passThrough_aie(T *restrict in, T *restrict out,
                                                const int32_t height,
                                                const int32_t width) {
-  //::aie::vector<T, N> data_out;
-  //::aie::mask<N> temp_val;
+  event0();
+
   v64uint8 *restrict outPtr = (v64uint8 *)out;
   v64uint8 *restrict inPtr = (v64uint8 *)in;
 
   for (int j = 0; j < (height * width); j += N) // Nx samples per loop
-    chess_prepare_for_pipelining chess_loop_range(6, ) {
-      //::aie::vector<T, N> tmpVector = ::aie::load_v(in);
-      //::aie::store_v(out, tmpVector);
-
-      *outPtr++ = *inPtr++;
-
-      // in += N;
-      // out += N;
-    }
+    chess_prepare_for_pipelining chess_loop_range(6, ) { *outPtr++ = *inPtr++; }
+
+  event1();
 }
 
 extern "C" {

diff --git a/...basic/passthrough_hardware/CMakeLists.txt → ...les/basic/passthrough_dmas/CMakeLists.txt b/...basic/passthrough_hardware/CMakeLists.txt → ...les/basic/passthrough_dmas/CMakeLists.txt
diff --git a/...mples/basic/passthrough_hardware/Makefile → ..._examples/basic/passthrough_dmas/Makefile b/...mples/basic/passthrough_hardware/Makefile → ..._examples/basic/passthrough_dmas/Makefile
@@ -12,9 +12,7 @@ ACDC_AIE = $(dir $(shell which aie-opt))/..
 
 SHELL := /bin/bash
 
-targetname = passThroughHardware
-devicename = ipu
-col = 0
+targetname = passThroughDMAs
 LENGTH ?= 4096
 
 all: build/final.xclbin build/insts.txt
@@ -71,4 +69,4 @@ vck5000: build/aie.mlir
 
 
 clean:
-	rm -rf build _build inst aie.mlir.prj core_* test.elf ${targetname}.exe
+	rm -rf build _build inst ${targetname}.exe
diff --git a/...amples/basic/passthrough_hardware/aie2.py → ...g_examples/basic/passthrough_dmas/aie2.py b/...amples/basic/passthrough_hardware/aie2.py → ...g_examples/basic/passthrough_dmas/aie2.py
@@ -14,7 +14,6 @@
 from aie.extras.context import mlir_mod_ctx
 
 N = 4096
-N_in_bytes = N * 4
 
 
 # Deciphering the command line arguments 
@@ -54,9 +53,8 @@ def device_body():
             # Compute tile 2
             @core(ComputeTile2)
             def core_body():
-                tmp = memref.alloc(1, T.i32())
-                v0 = arith.constant(0, T.i32())
-                memref.store(v0, tmp, [0])
+                for _ in for_(sys.maxsize):
+                    yield_([])
 
             # To/from AIE-array data movement
             tensor_ty = T.memref(N, T.i32())

diff --git a/...amples/basic/passthrough_hardware/run.lit → ...g_examples/basic/passthrough_dmas/run.lit b/...amples/basic/passthrough_hardware/run.lit → ...g_examples/basic/passthrough_dmas/run.lit
diff --git a/...asic/passthrough_hardware/run_vck5000.lit → ...es/basic/passthrough_dmas/run_vck5000.lit b/...asic/passthrough_hardware/run_vck5000.lit → ...es/basic/passthrough_dmas/run_vck5000.lit
diff --git a/...mples/basic/passthrough_hardware/test.cpp → ..._examples/basic/passthrough_dmas/test.cpp b/...mples/basic/passthrough_hardware/test.cpp → ..._examples/basic/passthrough_dmas/test.cpp
diff --git a/...sic/passthrough_hardware/test_vck5000.cpp → ...s/basic/passthrough_dmas/test_vck5000.cpp b/...sic/passthrough_hardware/test_vck5000.cpp → ...s/basic/passthrough_dmas/test_vck5000.cpp
diff --git a/programming_examples/basic/passthrough_kernel/CMakeLists.txt b/programming_examples/basic/passthrough_kernel/CMakeLists.txt
@@ -0,0 +1,75 @@
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2024 Xilinx Inc.
+
+# parameters
+# -DBOOST_ROOT: Path to Boost install
+# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo
+# -DXRT_LIB_DIR: Path to xrt_coreutil.lib
+# -DTARGET_NAME: Target name to be built
+
+# cmake needs this line
+cmake_minimum_required(VERSION 3.1)
+
+find_program(WSL NAMES powershell.exe)
+
+if (NOT WSL)
+    set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
+    set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
+    set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
+else()
+    set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
+    set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
+    set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+endif ()
+
+set(PASSTHROUGH_SIZE 4096 CACHE STRING "size")
+set(TARGET_NAME test CACHE STRING "Target to be built")
+
+SET (ProjectName ${TARGET_NAME})
+SET (currentTarget ${TARGET_NAME})
+
+if ( WSL )
+	set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR})
+endif ()
+
+project(${ProjectName})
+
+# Find packages
+find_package(Boost REQUIRED)
+
+add_executable(${currentTarget}
+${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp
+    test.cpp
+)
+
+target_compile_definitions(${currentTarget} PUBLIC 
+        PASSTHROUGH_SIZE=${PASSTHROUGH_SIZE}
+        DISABLE_ABI_CHECK=1 
+        )
+
+target_include_directories (${currentTarget} PUBLIC 
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../utils
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib
+    ${XRT_INC_DIR}
+    ${Boost_INCLUDE_DIRS}
+)
+
+target_link_directories(${currentTarget} PUBLIC
+    ${XRT_LIB_DIR}
+    ${Boost_LIBRARY_DIRS}
+)
+
+if (NOT WSL)
+    target_link_libraries(${currentTarget} PUBLIC
+        xrt_coreutil
+        boost_program_options
+        boost_filesystem
+    )
+else()
+    target_link_libraries(${currentTarget} PUBLIC
+        xrt_coreutil
+    )
+endif()
diff --git a/programming_examples/basic/passthrough_kernel/Makefile b/programming_examples/basic/passthrough_kernel/Makefile
@@ -0,0 +1,49 @@
+##===- Makefile -----------------------------------------------------------===##
+# 
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+# 
+##===----------------------------------------------------------------------===##
+
+include ../../makefile-common
+
+VPATH := ../../../aie_kernels/aie_generic
+
+PASSTHROUGH_SIZE = 4096
+
+targetname = passThroughKernel
+
+.PHONY: all template clean
+
+all: build/final_${PASSTHROUGH_SIZE}.xclbin
+
+build/aie2_lineBased_8b_${PASSTHROUGH_SIZE}.mlir: aie2.py
+	mkdir -p ${@D}
+	python3 $< ${PASSTHROUGH_SIZE} > $@
+
+build/passThrough.cc.o: passThrough.cc
+	mkdir -p ${@D}
+	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -DBIT_WIDTH=8 -c $(<:%=../%) -o ${@F}
+
+build/final_${PASSTHROUGH_SIZE}.xclbin: build/aie2_lineBased_8b_${PASSTHROUGH_SIZE}.mlir build/passThrough.cc.o
+	mkdir -p ${@D}
+	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \
+		--xclbin-name=${@F} --ipu-insts-name=insts.txt $(<:%=../%)
+
+${targetname}.exe: test.cpp
+	rm -rf _build
+	mkdir -p _build
+	cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname} -DPASSTHROUGH_SIZE=${PASSTHROUGH_SIZE}
+	cd _build && ${powershell} cmake --build . --config Release
+ifeq "${powershell}" "powershell.exe"
+	cp _build/${targetname}.exe $@
+else
+	cp _build/${targetname} $@ 
+endif
+
+run: ${targetname}.exe build/final_${PASSTHROUGH_SIZE}.xclbin build/insts.txt
+	${powershell} ./$< -x build/final_${PASSTHROUGH_SIZE}.xclbin -i build/insts.txt -k MLIR_AIE
+
+clean:
+	rm -rf build _build ${targetname}.exe
diff --git a/programming_examples/basic/passthrough_kernel/aie2.py b/programming_examples/basic/passthrough_kernel/aie2.py
@@ -0,0 +1,170 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2024 AMD Inc.
+
+import sys
+
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+from aie.dialects.scf import *
+from aie.extras.context import mlir_mod_ctx
+
+N = 1024
+
+if len(sys.argv) == 2:
+    N = int(sys.argv[1])
+
+lineWidthInBytes = N // 4  # chop input in 4 sub-tensors
+lineWidthInInt32s = lineWidthInBytes // 4
+
+enableTrace = False
+traceSizeInBytes = 8192
+traceSizeInInt32s = traceSizeInBytes // 4
+
+
+def passthroughKernel():
+    with mlir_mod_ctx() as ctx:
+
+        @device(AIEDevice.ipu)
+        def device_body():
+            # define types
+            memRef_ty = T.memref(lineWidthInBytes, T.ui8())
+
+            # AIE Core Function declarations
+            passThroughLine = external_func(
+                "passThroughLine", inputs=[memRef_ty, memRef_ty, T.i32()]
+            )
+
+            # Tile declarations
+            ShimTile = tile(0, 0)
+            ComputeTile2 = tile(0, 2)
+
+            if enableTrace:
+                flow(ComputeTile2, "Trace", 0, ShimTile, "DMA", 1)
+
+            # AIE-array data movement with object fifos
+            of_in = object_fifo("in", ShimTile, ComputeTile2, 2, memRef_ty)
+            of_out = object_fifo("out", ComputeTile2, ShimTile, 2, memRef_ty)
+
+            # Set up compute tiles
+
+            # Compute tile 2
+            @core(ComputeTile2, "passThrough.cc.o")
+            def core_body():
+                for _ in for_(sys.maxsize):
+                    elemOut = of_out.acquire(ObjectFifoPort.Produce, 1)
+                    elemIn = of_in.acquire(ObjectFifoPort.Consume, 1)
+                    call(passThroughLine, [elemIn, elemOut, lineWidthInBytes])
+                    of_in.release(ObjectFifoPort.Consume, 1)
+                    of_out.release(ObjectFifoPort.Produce, 1)
+                    yield_([])
+
+            #    print(ctx.module.operation.verify())
+
+            tensorSize = N
+            tensorSizeInInt32s = tensorSize // 4
+            tensor_ty = T.memref(lineWidthInInt32s, T.i32())
+
+            @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty)
+            def sequence(inTensor, outTensor, notUsed):
+                if enableTrace:
+                    # Trace output
+
+                    # Trace_Event0, Trace_Event1: Select which events to trace.
+                    # Note that the event buffers only appear to be transferred to DDR in
+                    # bursts of 256 bytes. If less than 256 bytes are written, you may not
+                    # see trace output, or only see it on the next iteration of your
+                    # kernel invocation, as the buffer gets filled up. Note that, even
+                    # though events are encoded as 4 byte words, it may take more than 64
+                    # events to fill the buffer to 256 bytes and cause a flush, since
+                    # multiple repeating events can be 'compressed' by the trace mechanism.
+                    # In order to always generate sufficient events, we add the "assert
+                    # TRUE" event to one slot, which fires every cycle, and thus fills our
+                    # buffer quickly.
+
+                    # Some events:
+                    # TRUE                       (0x01)
+                    # STREAM_STALL               (0x18)
+                    # LOCK_STALL                 (0x1A)
+                    # EVENTS_CORE_INSTR_EVENT_1  (0x22)
+                    # EVENTS_CORE_INSTR_EVENT_0  (0x21)
+                    # INSTR_VECTOR               (0x25)  Core executes a vector MAC, ADD or compare instruction
+                    # INSTR_LOCK_ACQUIRE_REQ     (0x2C)  Core executes a lock acquire instruction
+                    # INSTR_LOCK_RELEASE_REQ     (0x2D)  Core executes a lock release instruction
+                    # EVENTS_CORE_PORT_RUNNING_1 (0x4F)
+                    # EVENTS_CORE_PORT_RUNNING_0 (0x4B)
+
+                    # Trace_Event0  (4 slots)
+                    IpuWrite32(0, 2, 0x340E0, 0x4B222125)
+                    # Trace_Event1  (4 slots)
+                    IpuWrite32(0, 2, 0x340E4, 0x2D2C1A4F)
+
+                    # Event slots as configured above:
+                    # 0: Kernel executes vector instruction
+                    # 1: Event 0 -- Kernel starts
+                    # 2: Event 1 -- Kernel done
+                    # 3: Port_Running_0
+                    # 4: Port_Running_1
+                    # 5: Lock Stall
+                    # 6: Lock Acquire Instr
+                    # 7: Lock Release Instr
+
+                    # Stream_Switch_Event_Port_Selection_0
+                    # This is necessary to capture the Port_Running_0 and Port_Running_1 events
+                    IpuWrite32(0, 2, 0x3FF00, 0x121)
+
+                    # Trace_Control0: Define trace start and stop triggers. Set start event TRUE.
+                    IpuWrite32(0, 2, 0x340D0, 0x10000)
+
+                    # Start trace copy out.
+                    IpuWriteBdShimTile(
+                        bd_id=3,
+                        buffer_length=traceSizeInBytes,
+                        buffer_offset=tensorSize,
+                        enable_packet=0,
+                        out_of_order_id=0,
+                        packet_id=0,
+                        packet_type=0,
+                        column=0,
+                        column_num=1,
+                        d0_stride=0,
+                        d0_wrap=0,
+                        d1_stride=0,
+                        d1_wrap=0,
+                        d2_stride=0,
+                        ddr_id=2,
+                        iteration_current=0,
+                        iteration_stride=0,
+                        iteration_wrap=0,
+                        lock_acq_enable=0,
+                        lock_acq_id=0,
+                        lock_acq_val=0,
+                        lock_rel_id=0,
+                        lock_rel_val=0,
+                        next_bd=0,
+                        use_next_bd=0,
+                        valid_bd=1,
+                    )
+                    IpuWrite32(0, 0, 0x1D20C, 0x3)
+
+                ipu_dma_memcpy_nd(
+                    metadata="in",
+                    bd_id=0,
+                    mem=inTensor,
+                    sizes=[1, 1, 1, tensorSizeInInt32s],
+                )
+                ipu_dma_memcpy_nd(
+                    metadata="out",
+                    bd_id=1,
+                    mem=outTensor,
+                    sizes=[1, 1, 1, tensorSizeInInt32s],
+                )
+                ipu_sync(column=0, row=0, direction=0, channel=0)
+
+    print(ctx.module)
+
+
+passthroughKernel()
diff --git a/programming_examples/basic/passthrough_kernel/run.lit b/programming_examples/basic/passthrough_kernel/run.lit
@@ -0,0 +1,12 @@
+// (c) Copyright 2023 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai, chess
+//
+// RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -c %S/../../../aie_kernels/aie_generic/passThrough.cc -o passThrough.cc.o
+// RUN: %python %S/aie2.py 4096 | aie-opt -cse -canonicalize -o ./aie.mlir
+// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
+// RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall -DPASSTHROUGH_SIZE=4096 -I%S/../../utils %S/../../utils/xrtUtils.cpp %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
+// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+// CHECK: PASS!
+