From cd2345a95a3589341b29ef5d4dd239762ca692b4 Mon Sep 17 00:00:00 2001
From: Kristof Denolf <kristof.denolf@amd.com>
Date: Wed, 10 Apr 2024 14:53:28 -0600
Subject: [PATCH] basic passthrough_kernel

---
 .../aie_generic}/passThrough.cc               |  13 +-
 .../basic/passthrough_kernel/CMakeLists.txt   |  75 ++++++++
 .../basic/passthrough_kernel/Makefile         |  49 +++++
 .../basic/passthrough_kernel/aie2.py          | 169 ++++++++++++++++++
 .../basic/passthrough_kernel/test.cpp         | 130 ++++++++++++++
 5 files changed, 424 insertions(+), 12 deletions(-)
 rename {programming_examples/vision/vision_kernels => aie_kernels/aie_generic}/passThrough.cc (86%)
 create mode 100644 programming_examples/basic/passthrough_kernel/CMakeLists.txt
 create mode 100644 programming_examples/basic/passthrough_kernel/Makefile
 create mode 100644 programming_examples/basic/passthrough_kernel/aie2.py
 create mode 100644 programming_examples/basic/passthrough_kernel/test.cpp
diff --git a/programming_examples/vision/vision_kernels/passThrough.cc b/aie_kernels/aie_generic/passThrough.cc
similarity index 86%
rename from programming_examples/vision/vision_kernels/passThrough.cc
rename to aie_kernels/aie_generic/passThrough.cc
index 0928af33f0..0fa7123ca9 100644
--- a/programming_examples/vision/vision_kernels/passThrough.cc
+++ b/aie_kernels/aie_generic/passThrough.cc
@@ -15,29 +15,22 @@
 #include <stdio.h>
 #include <stdlib.h>
 
-#define REL_WRITE 0
-#define REL_READ 1
-
 #include <aie_api/aie.hpp>
 
+// Copies height * width elements from `in` to `out`. Each iteration moves one
+// 64-byte v64uint8 vector while `j` advances by N, so the caller must pick N
+// with N * sizeof(T) == 64 and a total size that is a multiple of N;
+// otherwise the byte count copied will not match height * width elements.
 template <typename T, int N>
 __attribute__((noinline)) void passThrough_aie(T *restrict in, T *restrict out,
                                                const int32_t height,
                                                const int32_t width) {
-  //::aie::vector<T, N> data_out;
-  //::aie::mask<N> temp_val;
   v64uint8 *restrict outPtr = (v64uint8 *)out;
   v64uint8 *restrict inPtr = (v64uint8 *)in;
 
   for (int j = 0; j < (height * width); j += N) // Nx samples per loop
     chess_prepare_for_pipelining chess_loop_range(6, ) {
-      //::aie::vector<T, N> tmpVector = ::aie::load_v(in);
-      //::aie::store_v(out, tmpVector);
-
       *outPtr++ = *inPtr++;
-
-      // in += N;
-      // out += N;
     }
 }
 
diff --git a/programming_examples/basic/passthrough_kernel/CMakeLists.txt b/programming_examples/basic/passthrough_kernel/CMakeLists.txt
new file mode 100644
index 0000000000..483fc84fb9
--- /dev/null
+++ b/programming_examples/basic/passthrough_kernel/CMakeLists.txt
@@ -0,0 +1,75 @@
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2023 Xilinx Inc.
+
+# parameters
+# -DBOOST_ROOT: Path to Boost install
+# -DPASSTHROUGH_SIZE: Number of buffer elements, passed to test.cpp as a define
+# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo
+# -DXRT_LIB_DIR: Path to xrt_coreutil.lib
+# -DTARGET_NAME: Target name to be built
+
+# target_link_directories() below requires CMake 3.13 or newer
+cmake_minimum_required(VERSION 3.13)
+
+find_program(WSL NAMES powershell.exe)
+
+if (NOT WSL)
+    set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
+    set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
+    set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
+else()
+    set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
+    set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
+    set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+endif ()
+
+set(PASSTHROUGH_SIZE 4096 CACHE STRING "size")
+set(TARGET_NAME test CACHE STRING "Target to be built")
+
+SET (ProjectName ${TARGET_NAME})
+SET (currentTarget ${TARGET_NAME})
+
+if ( WSL )
+    set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR})
+endif ()
+
+project(${ProjectName})
+
+# Find packages
+find_package(Boost REQUIRED)
+
+add_executable(${currentTarget}
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../utils/xrtUtils.cpp
+    test.cpp
+)
+
+target_compile_definitions(${currentTarget} PUBLIC
+        PASSTHROUGH_SIZE=${PASSTHROUGH_SIZE}
+        DISABLE_ABI_CHECK=1
+        )
+
+target_include_directories (${currentTarget} PUBLIC
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../utils
+    ${XRT_INC_DIR}
+    ${Boost_INCLUDE_DIRS}
+)
+
+target_link_directories(${currentTarget} PUBLIC
+    ${XRT_LIB_DIR}
+    ${Boost_LIBRARY_DIRS}
+)
+
+if (NOT WSL)
+    target_link_libraries(${currentTarget} PUBLIC
+        xrt_coreutil
+        boost_program_options
+        boost_filesystem
+    )
+else()
+    target_link_libraries(${currentTarget} PUBLIC
+        xrt_coreutil
+    )
+endif()
diff --git a/programming_examples/basic/passthrough_kernel/Makefile b/programming_examples/basic/passthrough_kernel/Makefile
new file mode 100644
index 0000000000..9fea098d84
--- /dev/null
+++ b/programming_examples/basic/passthrough_kernel/Makefile
@@ -0,0 +1,49 @@
+##===- Makefile -----------------------------------------------------------===##
+#
+# This file licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+##===----------------------------------------------------------------------===##
+
+include ../makefile-common
+# Kernel sources (passThrough.cc) are looked up in the shared kernel directory
+VPATH := ../../../aie_kernels/aie_generic
+# Transfer size; forwarded to aie2.py and to the host build as PASSTHROUGH_SIZE
+PASSTHROUGH_SIZE = 4096
+
+targetname = passthrough_kernel
+
+.PHONY: all template run clean
+
+all: build/final_${PASSTHROUGH_SIZE}.xclbin
+
+build/aie2_lineBased_8b_${PASSTHROUGH_SIZE}.mlir: aie2.py
+	mkdir -p ${@D}
+	python3 $< ${PASSTHROUGH_SIZE} > $@
+
+build/passThrough.cc.o: passThrough.cc
+	mkdir -p ${@D}
+	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -DBIT_WIDTH=8 -c $(<:%=../%) -o ${@F}
+
+build/final_${PASSTHROUGH_SIZE}.xclbin: build/aie2_lineBased_8b_${PASSTHROUGH_SIZE}.mlir build/passThrough.cc.o
+	mkdir -p ${@D}
+	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \
+		--xclbin-name=${@F} --ipu-insts-name=insts.txt $(<:%=../%)
+
+${targetname}.exe: test.cpp
+	rm -rf _build
+	mkdir -p _build
+	cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname} -DPASSTHROUGH_SIZE=${PASSTHROUGH_SIZE}
+	cd _build && ${powershell} cmake --build . --config Release
+ifeq "${powershell}" "powershell.exe"
+	cp _build/${targetname}.exe $@
+else
+	cp _build/${targetname} $@
+endif
+
+run: ${targetname}.exe build/final_${PASSTHROUGH_SIZE}.xclbin build/insts.txt
+	${powershell} ./$< -x build/final_${PASSTHROUGH_SIZE}.xclbin -i build/insts.txt -k MLIR_AIE
+
+clean:
+	rm -rf build _build ${targetname}.exe
diff --git a/programming_examples/basic/passthrough_kernel/aie2.py b/programming_examples/basic/passthrough_kernel/aie2.py
new file mode 100644
index 0000000000..cb5877c4bc
--- /dev/null
+++ b/programming_examples/basic/passthrough_kernel/aie2.py
@@ -0,0 +1,169 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2023 AMD Inc.
+
+import sys
+
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+from aie.dialects.scf import *
+from aie.extras.context import mlir_mod_ctx
+
+# Transfer size in bytes; the first command-line argument overrides the default.
+width = int(sys.argv[1]) if len(sys.argv) == 2 else 1024
+if width <= 0 or width % 4 != 0:
+    sys.exit("passthrough size must be a positive multiple of 4 bytes")
+
+lineWidthInBytes = width
+lineWidthInInt32s = lineWidthInBytes // 4
+
+enableTrace = False  # when True, the trace flow and trace setup are emitted too
+traceSizeInBytes = 8192
+traceSizeInInt32s = traceSizeInBytes // 4
+
+
+def passThroughAIE2():
+    """Build the pass-through design in an MLIR module and print it to stdout."""
+    with mlir_mod_ctx() as ctx:
+        @device(AIEDevice.ipu)
+        def device_body():
+            # define types: one line of lineWidthInBytes unsigned 8-bit values
+            line_ty = T.memref(lineWidthInBytes, T.ui8())
+
+            # AIE Core Function declarations
+            passThroughLine = external_func(
+                "passThroughLine", inputs=[line_ty, line_ty, T.i32()]
+            )
+
+            # Tile declarations
+            ShimTile = tile(0, 0)
+            ComputeTile2 = tile(0, 2)
+
+            if enableTrace:
+                flow(ComputeTile2, "Trace", 0, ShimTile, "DMA", 1)
+
+            # AIE-array data movement with object fifos: shim -> compute -> shim
+            of_in = object_fifo("in", ShimTile, ComputeTile2, 2, line_ty)
+            of_out = object_fifo("out", ComputeTile2, ShimTile, 2, line_ty)
+
+            # Set up compute tiles
+
+            # Compute tile 2: hand one "in" line and one "out" line to the kernel
+            @core(ComputeTile2, "passThrough.cc.o")
+            def core_body():
+                for _ in for_(sys.maxsize):
+                    elemOut = of_out.acquire(ObjectFifoPort.Produce, 1)
+                    elemIn = of_in.acquire(ObjectFifoPort.Consume, 1)
+                    call(passThroughLine, [elemIn, elemOut, width])
+                    of_in.release(ObjectFifoPort.Consume, 1)
+                    of_out.release(ObjectFifoPort.Produce, 1)
+                    yield_([])
+
+            #    print(ctx.module.operation.verify())
+            # The runtime sequence addresses the same byte count as int32 words
+            tensorSize = width
+            tensorSizeInInt32s = tensorSize // 4
+            tensor_ty = T.memref(tensorSizeInInt32s, T.i32())
+
+            @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty)
+            def sequence(inTensor, outTensor, notUsed):
+                if enableTrace:
+                    # Trace output
+
+                    # Trace_Event0, Trace_Event1: Select which events to trace.
+                    # Note that the event buffers only appear to be transferred to DDR in
+                    # bursts of 256 bytes. If less than 256 bytes are written, you may not
+                    # see trace output, or only see it on the next iteration of your
+                    # kernel invocation, as the buffer gets filled up. Note that, even
+                    # though events are encoded as 4 byte words, it may take more than 64
+                    # events to fill the buffer to 256 bytes and cause a flush, since
+                    # multiple repeating events can be 'compressed' by the trace mechanism.
+                    # In order to always generate sufficient events, we add the "assert
+                    # TRUE" event to one slot, which fires every cycle, and thus fills our
+                    # buffer quickly.
+
+                    # Some events:
+                    # TRUE                       (0x01)
+                    # STREAM_STALL               (0x18)
+                    # LOCK_STALL                 (0x1A)
+                    # EVENTS_CORE_INSTR_EVENT_1  (0x22)
+                    # EVENTS_CORE_INSTR_EVENT_0  (0x21)
+                    # INSTR_VECTOR               (0x25)  Core executes a vector MAC, ADD or compare instruction
+                    # INSTR_LOCK_ACQUIRE_REQ     (0x2C)  Core executes a lock acquire instruction
+                    # INSTR_LOCK_RELEASE_REQ     (0x2D)  Core executes a lock release instruction
+                    # EVENTS_CORE_PORT_RUNNING_1 (0x4F)
+                    # EVENTS_CORE_PORT_RUNNING_0 (0x4B)
+
+                    # Trace_Event0  (4 slots)
+                    IpuWrite32(0, 2, 0x340E0, 0x4B222125)
+                    # Trace_Event1  (4 slots)
+                    IpuWrite32(0, 2, 0x340E4, 0x2D2C1A4F)
+
+                    # Event slots as configured above:
+                    # 0: Kernel executes vector instruction
+                    # 1: Event 0 -- Kernel starts
+                    # 2: Event 1 -- Kernel done
+                    # 3: Port_Running_0
+                    # 4: Port_Running_1
+                    # 5: Lock Stall
+                    # 6: Lock Acquire Instr
+                    # 7: Lock Release Instr
+
+                    # Stream_Switch_Event_Port_Selection_0
+                    # This is necessary to capture the Port_Running_0 and Port_Running_1 events
+                    IpuWrite32(0, 2, 0x3FF00, 0x121)
+
+                    # Trace_Control0: Define trace start and stop triggers. Set start event TRUE.
+                    IpuWrite32(0, 2, 0x340D0, 0x10000)
+
+                    # Start trace copy out.
+                    IpuWriteBdShimTile(
+                        bd_id=3,
+                        buffer_length=traceSizeInBytes,
+                        buffer_offset=tensorSize,
+                        enable_packet=0,
+                        out_of_order_id=0,
+                        packet_id=0,
+                        packet_type=0,
+                        column=0,
+                        column_num=1,
+                        d0_stride=0,
+                        d0_wrap=0,
+                        d1_stride=0,
+                        d1_wrap=0,
+                        d2_stride=0,
+                        ddr_id=2,
+                        iteration_current=0,
+                        iteration_stride=0,
+                        iteration_wrap=0,
+                        lock_acq_enable=0,
+                        lock_acq_id=0,
+                        lock_acq_val=0,
+                        lock_rel_id=0,
+                        lock_rel_val=0,
+                        next_bd=0,
+                        use_next_bd=0,
+                        valid_bd=1,
+                    )
+                    IpuWrite32(0, 0, 0x1D20C, 0x3)
+                # Move the full input and output buffers as flat runs of int32 words
+                ipu_dma_memcpy_nd(
+                    metadata="in",
+                    bd_id=0,
+                    mem=inTensor,
+                    sizes=[1, 1, 1, tensorSizeInInt32s],
+                )
+                ipu_dma_memcpy_nd(
+                    metadata="out",
+                    bd_id=1,
+                    mem=outTensor,
+                    sizes=[1, 1, 1, tensorSizeInInt32s],
+                )
+                ipu_sync(column=0, row=0, direction=0, channel=0)
+
+    print(ctx.module)
+
+passThroughAIE2()
diff --git a/programming_examples/basic/passthrough_kernel/test.cpp b/programming_examples/basic/passthrough_kernel/test.cpp
new file mode 100644
index 0000000000..f9fe19281f
--- /dev/null
+++ b/programming_examples/basic/passthrough_kernel/test.cpp
@@ -0,0 +1,130 @@
+//===- test.cpp -------------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cstdint>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+
+#include "xrt/xrt_bo.h"
+#include "xrtUtils.h"
+
+#ifndef DATATYPES_USING_DEFINED
+#define DATATYPES_USING_DEFINED
+// Element type of the input and output buffers, each of which
+// holds PASSTHROUGH_SIZE elements. Configure this to match your
+// buffer data type.
+using DATATYPE = std::uint8_t;
+#endif
+
+namespace po = boost::program_options;
+
+// Host test: sends PASSTHROUGH_SIZE elements through the device kernel and
+// checks that the output buffer equals the input buffer. Returns 0 on PASS,
+// 1 on argument or input-file errors (and for --help), -1 on a data mismatch.
+int main(int argc, const char *argv[]) {
+  // Program arguments parsing
+  po::options_description desc("Allowed options");
+  desc.add_options()
+    ("help,h", "produce help message")
+    ("xclbin,x", po::value<std::string>()->required(), "the input xclbin path")
+    ("kernel,k", po::value<std::string>()->required(), "the kernel name in the XCLBIN (for instance PP_PRE_FD)")
+    ("verbosity,v", po::value<int>()->default_value(0), "the verbosity of the output")
+    ("instr,i", po::value<std::string>()->required(), "path of file containing userspace instructions to be sent to the LX6");
+  po::variables_map vm;
+
+  try {
+    po::store(po::parse_command_line(argc, argv, desc), vm);
+    // Serve --help before notify() rejects missing required options
+    if (vm.count("help")) {
+      std::cout << desc << "\n";
+      return 1;
+    }
+    po::notify(vm);
+  } catch (const std::exception &ex) {
+    std::cerr << ex.what() << "\n\n";
+    std::cerr << "Usage:\n" << desc << "\n";
+    return 1;
+  }
+
+  try {
+    check_arg_file_exists(vm, "xclbin");
+    check_arg_file_exists(vm, "instr");
+  } catch (const std::exception &ex) {
+    std::cerr << ex.what() << "\n\n";
+    return 1; // do not carry on with an input file that failed the check
+  }
+
+  // Load instruction sequence
+  std::vector<uint32_t> instr_v =
+      load_instr_sequence(vm["instr"].as<std::string>());
+
+  int verbosity = vm["verbosity"].as<int>();
+  if (verbosity >= 1)
+    std::cout << "Sequence instr count: " << instr_v.size() << "\n";
+
+  // Start the XRT context and load the kernel
+  xrt::device device;
+  xrt::kernel kernel;
+  initXrtLoadKernel(device, kernel, verbosity, vm["xclbin"].as<std::string>(), vm["kernel"].as<std::string>());
+
+  // set up the buffer objects (instruction words are uint32_t)
+  auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(uint32_t), XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0));
+  auto bo_inA = xrt::bo(device, PASSTHROUGH_SIZE * sizeof(DATATYPE), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
+  auto bo_out = xrt::bo(device, PASSTHROUGH_SIZE * sizeof(DATATYPE), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+
+  if (verbosity >= 1)
+    std::cout << "Writing data into buffer objects.\n";
+
+  // Copy instruction stream to xrt buffer object
+  void *bufInstr = bo_instr.map<void *>();
+  memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(uint32_t));
+
+  // Initialize buffer bo_inA with a ramp; values wrap for narrow DATATYPEs
+  DATATYPE *bufInA = bo_inA.map<DATATYPE *>();
+  for (int i = 0; i < PASSTHROUGH_SIZE; i++)
+    bufInA[i] = static_cast<DATATYPE>(i);
+
+  // Zero out buffer bo_out
+  DATATYPE *bufOut = bo_out.map<DATATYPE *>();
+  memset(bufOut, 0, PASSTHROUGH_SIZE * sizeof(DATATYPE));
+
+  // sync host to device memories
+  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_out.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  // Execute the kernel and wait to finish
+  if (verbosity >= 1)
+    std::cout << "Running Kernel.\n";
+  auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_out);
+  run.wait();
+
+  // Sync device to host memories
+  bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+
+  // Compare out to in
+  int numberOfDifferences = 0;
+  for(int i = 0; i < PASSTHROUGH_SIZE; i++) {
+    if(bufOut[i] != bufInA[i])
+      numberOfDifferences++;
+  }
+  // Print Pass/Fail result of our test
+  int res = 0;
+  if (numberOfDifferences == 0) {
+    printf("PASS!\n");
+    res = 0;
+  } else {
+    printf("Fail!\n");
+    res = -1;
+  }
+  printf("Testing passThrough done!\n");
+  return res;
+}