From c9046f7782815d9a21718e4f96e33aa5d94bdd46 Mon Sep 17 00:00:00 2001 From: Erika Hunhoff <54562339+hunhoffe@users.noreply.github.com> Date: Tue, 28 May 2024 09:48:38 -0600 Subject: [PATCH] Fix minor programming guide bugs, add programming guide lit tests to CI workflows (#1509) --- .github/workflows/buildAndTest.yml | 2 + .github/workflows/buildAndTestMulti.yml | 1 + .github/workflows/buildAndTestPythons.yml | 2 + .github/workflows/buildAndTestRyzenAI.yml | 1 + README.md | 2 +- programming_guide/section-1/README.md | 8 +- .../section-2b/02_Broadcast/README.md | 2 +- .../section-2/section-2d/Makefile | 24 +++ .../section-2/section-2d/aie2.py | 6 +- .../section-2/section-2d/aie2_multi.py | 6 +- .../section-2/section-2d/run_makefile.lit | 7 + .../01_single_double_buffer/Makefile | 20 ++ .../01_single_double_buffer/run_makefile.lit | 7 + .../01_single_double_buffer/single_buffer.py | 8 +- .../02_external_mem_to_core/Makefile | 12 +- .../02_external_mem_to_core/ext_to_core.py | 8 +- .../02_external_mem_to_core/run.lit | 10 - .../02_external_mem_to_core/run_makefile.lit | 9 + .../03_external_mem_to_core_L2/Makefile | 12 +- .../ext_to_core_L2.py | 8 +- .../03_external_mem_to_core_L2/run.lit | 10 - .../run_makefile.lit | 9 + .../04_distribute_L2/CMakeLists.txt | 75 +++++++ .../section-2e/04_distribute_L2/Makefile | 43 ++++ .../04_distribute_L2/distribute_L2.py | 10 +- .../04_distribute_L2/run_makefile.lit | 9 + .../section-2e/04_distribute_L2/test.cpp | 190 ++++++++++++++++++ .../section-2/section-2e/05_join_L2/Makefile | 12 +- .../05_join_L2/distribute_and_join_L2.py | 2 +- .../section-2e/05_join_L2/join_L2.py | 8 +- .../section-2/section-2e/05_join_L2/run.lit | 10 - .../section-2e/05_join_L2/run_makefile.lit | 9 + programming_guide/section-3/README.md | 12 +- programming_guide/section-3/aie2.py | 2 +- .../section-4/section-4a/Makefile | 2 +- .../section-4/section-4a/README.md | 2 +- .../section-4/section-4a/aie2.py | 2 +- .../section-4/section-4b/Makefile | 2 +- 
.../section-4/section-4b/aie2.py | 2 +- 39 files changed, 489 insertions(+), 77 deletions(-) create mode 100644 programming_guide/section-2/section-2d/Makefile create mode 100644 programming_guide/section-2/section-2d/run_makefile.lit create mode 100644 programming_guide/section-2/section-2e/01_single_double_buffer/Makefile create mode 100644 programming_guide/section-2/section-2e/01_single_double_buffer/run_makefile.lit delete mode 100644 programming_guide/section-2/section-2e/02_external_mem_to_core/run.lit create mode 100644 programming_guide/section-2/section-2e/02_external_mem_to_core/run_makefile.lit delete mode 100644 programming_guide/section-2/section-2e/03_external_mem_to_core_L2/run.lit create mode 100644 programming_guide/section-2/section-2e/03_external_mem_to_core_L2/run_makefile.lit create mode 100644 programming_guide/section-2/section-2e/04_distribute_L2/CMakeLists.txt create mode 100644 programming_guide/section-2/section-2e/04_distribute_L2/Makefile create mode 100644 programming_guide/section-2/section-2e/04_distribute_L2/run_makefile.lit create mode 100644 programming_guide/section-2/section-2e/04_distribute_L2/test.cpp delete mode 100644 programming_guide/section-2/section-2e/05_join_L2/run.lit create mode 100644 programming_guide/section-2/section-2e/05_join_L2/run_makefile.lit diff --git a/.github/workflows/buildAndTest.yml b/.github/workflows/buildAndTest.yml index f67d5d4077..623ab239d1 100644 --- a/.github/workflows/buildAndTest.yml +++ b/.github/workflows/buildAndTest.yml @@ -130,6 +130,7 @@ jobs: ninja check-aie ninja check-tutorials ninja check-reference-designs + ninja check-programming-guide # Build the repo test target in release mode to build and test. 
- name: Build and test (Release) @@ -165,3 +166,4 @@ jobs: ninja check-aie ninja check-tutorials ninja check-reference-designs + ninja check-programming-guide diff --git a/.github/workflows/buildAndTestMulti.yml b/.github/workflows/buildAndTestMulti.yml index 971f3c15c6..ae44f36896 100644 --- a/.github/workflows/buildAndTestMulti.yml +++ b/.github/workflows/buildAndTestMulti.yml @@ -150,4 +150,5 @@ jobs: ninja check-aie ninja check-tutorials ninja check-reference-designs + ninja check-programming-guide fi diff --git a/.github/workflows/buildAndTestPythons.yml b/.github/workflows/buildAndTestPythons.yml index e0e3b6f046..2e7f4d5c08 100644 --- a/.github/workflows/buildAndTestPythons.yml +++ b/.github/workflows/buildAndTestPythons.yml @@ -130,6 +130,7 @@ jobs: ninja check-aie ninja check-tutorials ninja check-reference-designs + ninja check-programming-guide # Build the repo test target in release mode to build and test. - name: Build and test (Release) @@ -164,3 +165,4 @@ jobs: ninja check-aie ninja check-tutorials ninja check-reference-designs + ninja check-programming-guide diff --git a/.github/workflows/buildAndTestRyzenAI.yml b/.github/workflows/buildAndTestRyzenAI.yml index e2e7bee451..bbd0c4cae3 100644 --- a/.github/workflows/buildAndTestRyzenAI.yml +++ b/.github/workflows/buildAndTestRyzenAI.yml @@ -164,5 +164,6 @@ jobs: ninja install ninja check-reference-designs + ninja check-programming-guide popd diff --git a/README.md b/README.md index cbd8d78088..1aa78b847c 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ ![](https://mlir.llvm.org//mlir-logo.png) -This repository contains an [MLIR-based](https://mlir.llvm.org/) toolchain for AI Engine-enabled devices, such as [AMD Ryzen™ AI](https://www.amd.com/en/products/ryzen-ai) and [Versal™](https://www.xilinx.com/products/technology/ai-engine.html). This repository can be used to generate low-level configurations for the AI Engine portion of these devices. 
AI Engines are organized as a spatial array of tiles, where each tile contains AI Engine cores and/or memories. The spatial array is connected by stream switches that can be configured to route data between AI Engine tiles scheduled by their programmable Data Movement Accelerators (DMAs). This repository contains MLIR representations, with multiple levels of abstraction, to target AI Engine devices. This enables compilers and developers to program AI Engine cores, as well as describe data movements and array connectivity. A Python API is made available as a convenient interface for generating MLIR design descriptions. Backend code generation is also included, targeting the [aie-rt](https://github.com/Xilinx/aie-rt/tree/main-aie) library. This toolchain uses the AI Engine compiler tool which is part of the AMD Vitis™ software installation: these tools require a free license for use from the [Product Licensing Site](https://www.xilinx.com/member/forms/license-form.html). +This repository contains an [MLIR-based](https://mlir.llvm.org/) toolchain for AI Engine-enabled devices, such as [AMD Ryzen™ AI](https://www.amd.com/en/products/processors/consumer/ryzen-ai.html) and [Versal™](https://www.xilinx.com/products/technology/ai-engine.html). This repository can be used to generate low-level configurations for the AI Engine portion of these devices. AI Engines are organized as a spatial array of tiles, where each tile contains AI Engine cores and/or memories. The spatial array is connected by stream switches that can be configured to route data between AI Engine tiles scheduled by their programmable Data Movement Accelerators (DMAs). This repository contains MLIR representations, with multiple levels of abstraction, to target AI Engine devices. This enables compilers and developers to program AI Engine cores, as well as describe data movements and array connectivity. A Python API is made available as a convenient interface for generating MLIR design descriptions. 
Backend code generation is also included, targeting the [aie-rt](https://github.com/Xilinx/aie-rt/tree/main-aie) library. This toolchain uses the AI Engine compiler tool which is part of the AMD Vitis™ software installation: these tools require a free license for use from the [Product Licensing Site](https://www.xilinx.com/member/forms/license-form.html). This project is primarily intended to support the open-source community, particularly tool builders, with low-level access to AIE devices and enable the development of a wide variety of programming models from higher level abstractions. We provide an example programming flow: Interface Representation for hands-ON (IRON) close-to-metal programming of the AIE-array. IRON is an open access toolkit enabling performance engineers to build fast and efficient, often specialized designs through a set of Python language bindings around the mlir-aie dialect. As such, it contains some examples, however this project is not intended to represent an end-to-end compilation flow for all application designs. If you're looking for an out-of-the-box experience for highly efficient machine learning, check out the [AMD Ryzen™ AI Software Platform](https://github.com/amd/RyzenAI-SW/). diff --git a/programming_guide/section-1/README.md b/programming_guide/section-1/README.md index 04888e0f7c..06e8a72eee 100644 --- a/programming_guide/section-1/README.md +++ b/programming_guide/section-1/README.md @@ -25,7 +25,7 @@ Then we declare a structural design function that will expand into MLIR code whe def mlir_aie_design(): <... AI Engine device, blocks, and connections ...> ``` -Let's look at how we declare the AI Engine device, blocks, and connections. We start off by declaring our AIE device via `@device(AIEDevice.npu)` or `@device(AIEDevice.xcvc1902)`. The blocks and connections themselves will then be declared inside the `def device_body():`. Here, we instantiate our AI Engine blocks, which are AIE compute tiles in this first example. 
+Let's look at how we declare the AI Engine device, blocks, and connections. We start off by declaring our AIE device via `@device(AIEDevice.npu1_1col)` or `@device(AIEDevice.xcvc1902)`. The blocks and connections themselves will then be declared inside the `def device_body():`. Here, we instantiate our AI Engine blocks, which are AIE compute tiles in this first example. The arguments for the tile declaration are the tile coordinates (column, row). We assign each declared tile to a variable in our Python program. @@ -33,7 +33,7 @@ The arguments for the tile declaration are the tile coordinates (column, row). W ``` # Device declaration - here using aie2 device NPU - @device(AIEDevice.npu) + @device(AIEDevice.npu1_1col) def device_body(): # Tile declarations @@ -54,7 +54,7 @@ Next to the compute tiles, an AIE-array also contains data movers for accessing ``` # Device declaration - here using aie2 device NPU - @device(AIEDevice.npu) + @device(AIEDevice.npu1_1col) def device_body(): # Tile declarations @@ -78,7 +78,7 @@ Next to the compute tiles, an AIE-array also contains data movers for accessing Qualify the `print(ctx.module)` call with a check on `ctx.module.operation.verify()` using a code block like the following: ``` res = ctx.module.operation.verify() - if(res == True): + if res == True: print(ctx.module) else: print(res) diff --git a/programming_guide/section-2/section-2b/02_Broadcast/README.md b/programming_guide/section-2/section-2b/02_Broadcast/README.md index 1381619588..c4b721632a 100644 --- a/programming_guide/section-2/section-2b/02_Broadcast/README.md +++ b/programming_guide/section-2/section-2b/02_Broadcast/README.md @@ -14,7 +14,7 @@ As was explained in the Introduction [section](../../section-2a/README.md#initia -For more low-level details regarding how the objects in the Object FIFO are transferred via the AXI stream through the DMAs of the producer and consumer tiles please see the mlir-aie [tutorials](/mlir-aie/tutorials/tutorial-7/). 
They are, however, not required to understand or use the Object FIFO API. +For more low-level details regarding how the objects in the Object FIFO are transferred via the AXI stream through the DMAs of the producer and consumer tiles please see the mlir-aie [tutorials](/mlir_tutorials/tutorial-7/). They are, however, not required to understand or use the Object FIFO API. Below is an example of the Object FIFO `of0` shown in the previous figure. It has a depth of `3` with one producer tile A and three consumer tiles B, C and D: ```python diff --git a/programming_guide/section-2/section-2d/Makefile b/programming_guide/section-2/section-2d/Makefile new file mode 100644 index 0000000000..00beaa5818 --- /dev/null +++ b/programming_guide/section-2/section-2d/Makefile @@ -0,0 +1,24 @@ +##===- Makefile -----------------------------------------------------------===## +# +# This file licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## + +srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) + +include ${srcdir}/../../../programming_examples/makefile-common + +all: build/aie.mlir build/aiemulti.mlir + +build/aie.mlir: ${srcdir}/aie2.py + mkdir -p ${@D} + python3 $< > $@ + +build/aiemulti.mlir: ${srcdir}/aie2_multi.py + mkdir -p ${@D} + python3 $< > $@ + +clean: + rm -rf build \ No newline at end of file diff --git a/programming_guide/section-2/section-2d/aie2.py b/programming_guide/section-2/section-2d/aie2.py index a523cb76ca..5aefaaf543 100644 --- a/programming_guide/section-2/section-2d/aie2.py +++ b/programming_guide/section-2/section-2d/aie2.py @@ -67,7 +67,11 @@ def core_body(): yield_([]) # Print the mlir conversion - print(ctx.module) + res = ctx.module.operation.verify() + if res == True: + print(ctx.module) + else: + print(res) # Call design function to generate mlir code to stdout diff --git a/programming_guide/section-2/section-2d/aie2_multi.py b/programming_guide/section-2/section-2d/aie2_multi.py index 4b7fee77d6..a9f2241a5c 100644 --- a/programming_guide/section-2/section-2d/aie2_multi.py +++ b/programming_guide/section-2/section-2d/aie2_multi.py @@ -97,7 +97,11 @@ def core_body(): yield_([]) # Print the mlir conversion - print(ctx.module) + res = ctx.module.operation.verify() + if res == True: + print(ctx.module) + else: + print(res) # Call design function to generate mlir code to stdout diff --git a/programming_guide/section-2/section-2d/run_makefile.lit b/programming_guide/section-2/section-2d/run_makefile.lit new file mode 100644 index 0000000000..fd436178e2 --- /dev/null +++ b/programming_guide/section-2/section-2d/run_makefile.lit @@ -0,0 +1,7 @@ +// (c) Copyright 2024 Advanced Micro Devices, Inc. 
+ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + // + // REQUIRES: ryzen_ai, chess + // + // RUN: make -f %S/Makefile clean + // RUN: make -f %S/Makefile \ No newline at end of file diff --git a/programming_guide/section-2/section-2e/01_single_double_buffer/Makefile b/programming_guide/section-2/section-2e/01_single_double_buffer/Makefile new file mode 100644 index 0000000000..2787c509ca --- /dev/null +++ b/programming_guide/section-2/section-2e/01_single_double_buffer/Makefile @@ -0,0 +1,20 @@ +##===- Makefile -----------------------------------------------------------===## +# +# This file licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## + +srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) + +include ${srcdir}/../../../../programming_examples/makefile-common + +all: build/aie.mlir + +build/aie.mlir: ${srcdir}/single_buffer.py + mkdir -p ${@D} + python3 $< > $@ + +clean: + rm -rf build \ No newline at end of file diff --git a/programming_guide/section-2/section-2e/01_single_double_buffer/run_makefile.lit b/programming_guide/section-2/section-2e/01_single_double_buffer/run_makefile.lit new file mode 100644 index 0000000000..fd436178e2 --- /dev/null +++ b/programming_guide/section-2/section-2e/01_single_double_buffer/run_makefile.lit @@ -0,0 +1,7 @@ +// (c) Copyright 2024 Advanced Micro Devices, Inc. 
+ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + // + // REQUIRES: ryzen_ai, chess + // + // RUN: make -f %S/Makefile clean + // RUN: make -f %S/Makefile \ No newline at end of file diff --git a/programming_guide/section-2/section-2e/01_single_double_buffer/single_buffer.py b/programming_guide/section-2/section-2e/01_single_double_buffer/single_buffer.py index 46a5f763a8..a68b397862 100644 --- a/programming_guide/section-2/section-2e/01_single_double_buffer/single_buffer.py +++ b/programming_guide/section-2/section-2e/01_single_double_buffer/single_buffer.py @@ -15,7 +15,7 @@ def single_buffer(): with mlir_mod_ctx() as ctx: - @device(AIEDevice.npu) + @device(AIEDevice.npu1_1col) def device_body(): memRef_16_ty = T.memref(16, T.i32()) @@ -52,7 +52,11 @@ def core_body(): of_in.release(ObjectFifoPort.Consume, 1) yield_([]) - print(ctx.module) + res = ctx.module.operation.verify() + if res == True: + print(ctx.module) + else: + print(res) single_buffer() diff --git a/programming_guide/section-2/section-2e/02_external_mem_to_core/Makefile b/programming_guide/section-2/section-2e/02_external_mem_to_core/Makefile index db95484c3d..7b7616c946 100644 --- a/programming_guide/section-2/section-2e/02_external_mem_to_core/Makefile +++ b/programming_guide/section-2/section-2e/02_external_mem_to_core/Makefile @@ -6,7 +6,9 @@ # ##===----------------------------------------------------------------------===## -include ../../../../programming_examples/makefile-common +srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) + +include ${srcdir}/../../../../programming_examples/makefile-common targetname = ext_to_core devicename = npu @@ -14,7 +16,7 @@ col = 0 all: build/final.xclbin build/insts.txt -build/aie.mlir: ext_to_core.py +build/aie.mlir: ${srcdir}/ext_to_core.py mkdir -p ${@D} python3 $< ${devicename} ${col} > $@ @@ -23,16 +25,16 @@ build/final.xclbin: build/aie.mlir cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \ 
--aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%) -${targetname}.exe: test.cpp +${targetname}.exe: ${srcdir}/test.cpp rm -rf _build mkdir -p _build - cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname} + cd _build && ${powershell} cmake -E env CXXFLAGS="-std=c++23 -ggdb" cmake ${srcdir} -D CMAKE_C_COMPILER=gcc-13 -D CMAKE_CXX_COMPILER=g++-13 -DTARGET_NAME=${targetname} cd _build && ${powershell} cmake --build . --config Release ifeq "${powershell}" "powershell.exe" cp _build/${targetname}.exe $@ else cp _build/${targetname} $@ -endif +endif run: ${targetname}.exe build/final.xclbin build/insts.txt ${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE diff --git a/programming_guide/section-2/section-2e/02_external_mem_to_core/ext_to_core.py b/programming_guide/section-2/section-2e/02_external_mem_to_core/ext_to_core.py index b49d1ce5fd..6925e6bd2d 100644 --- a/programming_guide/section-2/section-2e/02_external_mem_to_core/ext_to_core.py +++ b/programming_guide/section-2/section-2e/02_external_mem_to_core/ext_to_core.py @@ -15,7 +15,7 @@ def external_mem_to_core(): with mlir_mod_ctx() as ctx: - @device(AIEDevice.npu) + @device(AIEDevice.npu1_1col) def device_body(): memRef_24_ty = T.memref(24, T.i32()) @@ -62,7 +62,11 @@ def sequence(inTensor, notUsed, outTensor): ) npu_sync(column=0, row=0, direction=0, channel=0) - print(ctx.module) + res = ctx.module.operation.verify() + if res == True: + print(ctx.module) + else: + print(res) external_mem_to_core() diff --git a/programming_guide/section-2/section-2e/02_external_mem_to_core/run.lit b/programming_guide/section-2/section-2e/02_external_mem_to_core/run.lit deleted file mode 100644 index 543fdcc70b..0000000000 --- a/programming_guide/section-2/section-2e/02_external_mem_to_core/run.lit +++ /dev/null @@ -1,10 +0,0 @@ -// (c) Copyright 2024 Advanced Micro Devices, Inc. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// REQUIRES: ryzen_ai -// -// RUN: %python %S/ext_to_core.py npu 0 | aie-opt -cse -canonicalize -o ./aie.mlir -// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir -// RUN: g++ %S/test.cpp -o test.exe -std=c++23 -Wall -I%S/../../../../runtime_lib/test_lib %S/../../../../runtime_lib/test_lib/test_utils.cpp %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem -// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s -// CHECK: PASS! diff --git a/programming_guide/section-2/section-2e/02_external_mem_to_core/run_makefile.lit b/programming_guide/section-2/section-2e/02_external_mem_to_core/run_makefile.lit new file mode 100644 index 0000000000..a7b1bf34a4 --- /dev/null +++ b/programming_guide/section-2/section-2e/02_external_mem_to_core/run_makefile.lit @@ -0,0 +1,9 @@ +// (c) Copyright 2024 Advanced Micro Devices, Inc. + // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + // + // REQUIRES: ryzen_ai, chess + // + // RUN: make -f %S/Makefile clean + // RUN: make -f %S/Makefile + // RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s + // CHECK: PASS! 
\ No newline at end of file diff --git a/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/Makefile b/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/Makefile index 82fdd0057d..d59c441166 100644 --- a/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/Makefile +++ b/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/Makefile @@ -6,7 +6,9 @@ # ##===----------------------------------------------------------------------===## -include ../../../../programming_examples/makefile-common +srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) + +include ${srcdir}/../../../../programming_examples/makefile-common targetname = ext_to_core_L2 devicename = npu @@ -14,7 +16,7 @@ col = 0 all: build/final.xclbin build/insts.txt -build/aie.mlir: ext_to_core_L2.py +build/aie.mlir: ${srcdir}/ext_to_core_L2.py mkdir -p ${@D} python3 $< ${devicename} ${col} > $@ @@ -23,16 +25,16 @@ build/final.xclbin: build/aie.mlir cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \ --aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%) -${targetname}.exe: test.cpp +${targetname}.exe: ${srcdir}/test.cpp rm -rf _build mkdir -p _build - cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname} + cd _build && ${powershell} cmake -E env CXXFLAGS="-std=c++23 -ggdb" cmake ${srcdir} -D CMAKE_C_COMPILER=gcc-13 -D CMAKE_CXX_COMPILER=g++-13 -DTARGET_NAME=${targetname} cd _build && ${powershell} cmake --build . 
--config Release ifeq "${powershell}" "powershell.exe" cp _build/${targetname}.exe $@ else cp _build/${targetname} $@ -endif +endif run: ${targetname}.exe build/final.xclbin build/insts.txt ${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE diff --git a/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/ext_to_core_L2.py b/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/ext_to_core_L2.py index f97061615b..989808392c 100644 --- a/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/ext_to_core_L2.py +++ b/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/ext_to_core_L2.py @@ -15,7 +15,7 @@ def external_mem_to_core_L2(): with mlir_mod_ctx() as ctx: - @device(AIEDevice.npu) + @device(AIEDevice.npu1_1col) def device_body(): memRef_24_ty = T.memref(24, T.i32()) memRef_8_ty = T.memref(8, T.i32()) @@ -66,7 +66,11 @@ def sequence(inTensor, notUsed, outTensor): ) npu_sync(column=0, row=0, direction=0, channel=0) - print(ctx.module) + res = ctx.module.operation.verify() + if res == True: + print(ctx.module) + else: + print(res) external_mem_to_core_L2() diff --git a/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/run.lit b/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/run.lit deleted file mode 100644 index 937ed0dd9f..0000000000 --- a/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/run.lit +++ /dev/null @@ -1,10 +0,0 @@ -// (c) Copyright 2024 Advanced Micro Devices, Inc. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// REQUIRES: ryzen_ai -// -// RUN: %python %S/ext_to_core_L2.py npu 0 | aie-opt -cse -canonicalize -o ./aie.mlir -// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir -// RUN: g++ %S/test.cpp -o test.exe -std=c++23 -Wall -I%S/../../../../runtime_lib/test_lib %S/../../../../runtime_lib/test_lib/test_utils.cpp %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem -// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s -// CHECK: PASS! diff --git a/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/run_makefile.lit b/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/run_makefile.lit new file mode 100644 index 0000000000..a7b1bf34a4 --- /dev/null +++ b/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/run_makefile.lit @@ -0,0 +1,9 @@ +// (c) Copyright 2024 Advanced Micro Devices, Inc. + // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + // + // REQUIRES: ryzen_ai, chess + // + // RUN: make -f %S/Makefile clean + // RUN: make -f %S/Makefile + // RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s + // CHECK: PASS! \ No newline at end of file diff --git a/programming_guide/section-2/section-2e/04_distribute_L2/CMakeLists.txt b/programming_guide/section-2/section-2e/04_distribute_L2/CMakeLists.txt new file mode 100644 index 0000000000..46627ab31a --- /dev/null +++ b/programming_guide/section-2/section-2e/04_distribute_L2/CMakeLists.txt @@ -0,0 +1,75 @@ +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2023 Advanced Micro Devices, Inc. 
+ +# parameters +# -DBOOST_ROOT: Path to Boost install +# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo +# -DXRT_LIB_DIR: Path to xrt_coreutil.lib +# -DTARGET_NAME: Target name to be built + +# cmake needs this line +cmake_minimum_required(VERSION 3.1) + +set(CMAKE_CXX_STANDARD 23) +set(CMAKE_CXX_STANDARD_REQUIRED YES) + +find_program(WSL NAMES powershell.exe) + +if (NOT WSL) + set(CMAKE_C_COMPILER gcc-13) + set(CMAKE_CXX_COMPILER g++-13) + set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install") + set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo") + set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib") +else() + set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install") + set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo") + set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") +endif() + +set(TARGET_NAME test CACHE STRING "Target to be built") + +SET (ProjectName proj_${TARGET_NAME}) +SET (currentTarget ${TARGET_NAME}) + +if ( WSL ) + set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR}) +endif () + +project(${ProjectName}) + +# Find packages +find_package(Boost REQUIRED) + +add_executable(${currentTarget} + ${CMAKE_CURRENT_SOURCE_DIR}/../../../../runtime_lib/test_lib/test_utils.cpp + test.cpp +) + +target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1) + +target_include_directories (${currentTarget} PUBLIC + ${XRT_INC_DIR} + ${Boost_INCLUDE_DIRS} + ${CMAKE_CURRENT_SOURCE_DIR}/../../../../runtime_lib/test_lib +) + +target_link_directories(${currentTarget} PUBLIC + ${XRT_LIB_DIR} + ${Boost_LIBRARY_DIRS}s +) + +if (NOT WSL) + target_link_libraries(${currentTarget} PUBLIC + xrt_coreutil + boost_program_options + boost_filesystem + ) +else() + target_link_libraries(${currentTarget} PUBLIC + xrt_coreutil + ) +endif() diff --git 
a/programming_guide/section-2/section-2e/04_distribute_L2/Makefile b/programming_guide/section-2/section-2e/04_distribute_L2/Makefile new file mode 100644 index 0000000000..15801a682b --- /dev/null +++ b/programming_guide/section-2/section-2e/04_distribute_L2/Makefile @@ -0,0 +1,43 @@ +##===- Makefile -----------------------------------------------------------===## +# +# This file licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## + +srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) + +include ${srcdir}/../../../../programming_examples/makefile-common + +targetname = ext_to_core_L2 +devicename = npu +col = 0 + +all: build/final.xclbin build/insts.txt + +build/aie.mlir: ${srcdir}/distribute_L2.py + mkdir -p ${@D} + python3 $< ${devicename} ${col} > $@ + +build/final.xclbin: build/aie.mlir + mkdir -p ${@D} + cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \ + --aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%) + +${targetname}.exe: ${srcdir}/test.cpp + rm -rf _build + mkdir -p _build + cd _build && ${powershell} cmake -E env CXXFLAGS="-std=c++23 -ggdb" cmake ${srcdir} -D CMAKE_C_COMPILER=gcc-13 -D CMAKE_CXX_COMPILER=g++-13 -DTARGET_NAME=${targetname} + cd _build && ${powershell} cmake --build . 
--config Release +ifeq "${powershell}" "powershell.exe" + cp _build/${targetname}.exe $@ +else + cp _build/${targetname} $@ +endif + +run: ${targetname}.exe build/final.xclbin build/insts.txt + ${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE + +clean: + rm -rf build _build ${targetname}.exe \ No newline at end of file diff --git a/programming_guide/section-2/section-2e/04_distribute_L2/distribute_L2.py b/programming_guide/section-2/section-2e/04_distribute_L2/distribute_L2.py index a873be1655..4e7546b28d 100644 --- a/programming_guide/section-2/section-2e/04_distribute_L2/distribute_L2.py +++ b/programming_guide/section-2/section-2e/04_distribute_L2/distribute_L2.py @@ -15,7 +15,7 @@ def distribute_L2(): with mlir_mod_ctx() as ctx: - @device(AIEDevice.npu) + @device(AIEDevice.npu1_1col) def device_body(): memRef_24_ty = T.memref(24, T.i32()) memRef_8_ty = T.memref(8, T.i32()) @@ -73,12 +73,16 @@ def core_body(): for i in for_(8): v0 = memref.load(elem, [i]) v1 = arith.addi(v0, arith.constant(1, T.i32())) - memref.store(v1, elem_out, [i]) + memref.store(v1, elem, [i]) yield_([]) of_in2.release(ObjectFifoPort.Consume, 1) yield_([]) - print(ctx.module) + res = ctx.module.operation.verify() + if res == True: + print(ctx.module) + else: + print(res) distribute_L2() diff --git a/programming_guide/section-2/section-2e/04_distribute_L2/run_makefile.lit b/programming_guide/section-2/section-2e/04_distribute_L2/run_makefile.lit new file mode 100644 index 0000000000..a7b1bf34a4 --- /dev/null +++ b/programming_guide/section-2/section-2e/04_distribute_L2/run_makefile.lit @@ -0,0 +1,9 @@ +// (c) Copyright 2024 Advanced Micro Devices, Inc. + // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + // + // REQUIRES: ryzen_ai, chess + // + // RUN: make -f %S/Makefile clean + // RUN: make -f %S/Makefile + // RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s + // CHECK: PASS! 
\ No newline at end of file diff --git a/programming_guide/section-2/section-2e/04_distribute_L2/test.cpp b/programming_guide/section-2/section-2e/04_distribute_L2/test.cpp new file mode 100644 index 0000000000..19936b2da5 --- /dev/null +++ b/programming_guide/section-2/section-2e/04_distribute_L2/test.cpp @@ -0,0 +1,190 @@ +//===- test.cpp -------------------------------------------000---*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2023, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +#include <boost/program_options.hpp> +#include <cstdint> +#include <cstring> +#include <fstream> +#include <iostream> +#include <sstream> +#include <string> +#include <vector> + +#include "xrt/xrt_bo.h" +#include "xrt/xrt_device.h" +#include "xrt/xrt_kernel.h" + +constexpr int IN_SIZE = 48; +constexpr int OUT_SIZE = 48; + +namespace po = boost::program_options; + +void check_arg_file_exists(po::variables_map &vm_in, std::string name) { + if (!vm_in.count(name)) { + throw std::runtime_error("Error: no " + name + " file was provided\n"); + } else { + std::ifstream test(vm_in[name].as<std::string>()); + if (!test) { + throw std::runtime_error("The " + name + " file " + + vm_in[name].as<std::string>() + + " does not exist.\n"); + } + } +} + +std::vector<uint32_t> load_instr_sequence(std::string instr_path) { + std::ifstream instr_file(instr_path); + std::string line; + std::vector<uint32_t> instr_v; + while (std::getline(instr_file, line)) { + std::istringstream iss(line); + uint32_t a; + if (!(iss >> std::hex >> a)) { + throw std::runtime_error("Unable to parse instruction file\n"); + } + instr_v.push_back(a); + } + return instr_v; +} + +int main(int argc, const char *argv[]) { + + // Program arguments parsing + po::options_description desc("Allowed options"); + desc.add_options()("help,h", "produce help message")( + "xclbin,x", po::value<std::string>()->required(), + "the 
input xclbin path")( + "kernel,k", po::value()->required(), + "the kernel name in the XCLBIN (for instance PP_PRE_FD)")( + "verbosity,v", po::value()->default_value(0), + "the verbosity of the output")( + "instr,i", po::value()->required(), + "path of file containing userspace instructions to be sent to the LX6"); + po::variables_map vm; + + try { + po::store(po::parse_command_line(argc, argv, desc), vm); + po::notify(vm); + + if (vm.count("help")) { + std::cout << desc << "\n"; + return 1; + } + } catch (const std::exception &ex) { + std::cerr << ex.what() << "\n\n"; + std::cerr << "Usage:\n" << desc << "\n"; + return 1; + } + + check_arg_file_exists(vm, "xclbin"); + check_arg_file_exists(vm, "instr"); + + std::vector instr_v = + load_instr_sequence(vm["instr"].as()); + + int verbosity = vm["verbosity"].as(); + if (verbosity >= 1) + std::cout << "Sequence instr count: " << instr_v.size() << "\n"; + + // Start the XRT test code + // Get a device handle + unsigned int device_index = 0; + auto device = xrt::device(device_index); + + // Load the xclbin + if (verbosity >= 1) + std::cout << "Loading xclbin: " << vm["xclbin"].as() << "\n"; + auto xclbin = xrt::xclbin(vm["xclbin"].as()); + + if (verbosity >= 1) + std::cout << "Kernel opcode: " << vm["kernel"].as() << "\n"; + std::string Node = vm["kernel"].as(); + + // Get the kernel from the xclbin + auto xkernels = xclbin.get_kernels(); + auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(), + [Node](xrt::xclbin::kernel &k) { + auto name = k.get_name(); + std::cout << "Name: " << name << std::endl; + return name.rfind(Node, 0) == 0; + }); + auto kernelName = xkernel.get_name(); + + if (verbosity >= 1) + std::cout << "Registering xclbin: " << vm["xclbin"].as() + << "\n"; + + device.register_xclbin(xclbin); + + // get a hardware context + if (verbosity >= 1) + std::cout << "Getting hardware context.\n"; + xrt::hw_context context(device, xclbin.get_uuid()); + + // get a kernel handle + if (verbosity >= 1) + 
std::cout << "Getting handle to kernel:" << kernelName << "\n"; + auto kernel = xrt::kernel(context, kernelName); + + auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); + auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + + if (verbosity >= 1) + std::cout << "Writing data into buffer objects.\n"; + + uint32_t *bufInA = bo_inA.map(); + std::vector srcVecA; + for (int i = 0; i < IN_SIZE; i++) + srcVecA.push_back(1); + memcpy(bufInA, srcVecA.data(), (srcVecA.size() * sizeof(uint32_t))); + + void *bufInstr = bo_instr.map(); + memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); + + bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + if (verbosity >= 1) + std::cout << "Running Kernel.\n"; + auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); + run.wait(); + + bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + + uint32_t *bufOut = bo_out.map(); + + int errors = 0; + + for (uint32_t i = 0; i < 48; i++) { + uint32_t ref = 0; + if (*(bufOut + i) != ref) { + std::cout << "Error in output " << *(bufOut + i) << " != " << ref + << std::endl; + errors++; + } else { + std::cout << "Correct output " << *(bufOut + i) << " == " << ref + << std::endl; + } + } + + if (!errors) { + std::cout << "\nPASS!\n\n"; + return 0; + } else { + std::cout << "\nfailed.\n\n"; + return 1; + } +} \ No newline at end of file diff --git a/programming_guide/section-2/section-2e/05_join_L2/Makefile b/programming_guide/section-2/section-2e/05_join_L2/Makefile index 9de7d68f08..ae43cb353e 100644 --- a/programming_guide/section-2/section-2e/05_join_L2/Makefile +++ b/programming_guide/section-2/section-2e/05_join_L2/Makefile @@ -6,7 
+6,9 @@ # ##===----------------------------------------------------------------------===## -include ../../../../programming_examples/makefile-common +srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) + +include ${srcdir}/../../../../programming_examples/makefile-common targetname = distribute_and_join_L2 devicename = npu @@ -14,7 +16,7 @@ col = 0 all: build/final.xclbin build/insts.txt -build/aie.mlir: distribute_and_join_L2.py +build/aie.mlir: ${srcdir}/distribute_and_join_L2.py mkdir -p ${@D} python3 $< ${devicename} ${col} > $@ @@ -23,16 +25,16 @@ build/final.xclbin: build/aie.mlir cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \ --aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%) -${targetname}.exe: test.cpp +${targetname}.exe: ${srcdir}/test.cpp rm -rf _build mkdir -p _build - cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname} + cd _build && ${powershell} cmake -E env CXXFLAGS="-std=c++23 -ggdb" cmake ${srcdir} -D CMAKE_C_COMPILER=gcc-13 -D CMAKE_CXX_COMPILER=g++-13 -DTARGET_NAME=${targetname} cd _build && ${powershell} cmake --build . 
--config Release ifeq "${powershell}" "powershell.exe" cp _build/${targetname}.exe $@ else cp _build/${targetname} $@ -endif +endif run: ${targetname}.exe build/final.xclbin build/insts.txt ${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE diff --git a/programming_guide/section-2/section-2e/05_join_L2/distribute_and_join_L2.py b/programming_guide/section-2/section-2e/05_join_L2/distribute_and_join_L2.py index 815334e5b4..b8c264ea28 100644 --- a/programming_guide/section-2/section-2e/05_join_L2/distribute_and_join_L2.py +++ b/programming_guide/section-2/section-2e/05_join_L2/distribute_and_join_L2.py @@ -15,7 +15,7 @@ def distribute_join_L2(): with mlir_mod_ctx() as ctx: - @device(AIEDevice.npu) + @device(AIEDevice.npu1_1col) def device_body(): memRef_24_ty = T.memref(24, T.i32()) memRef_8_ty = T.memref(8, T.i32()) diff --git a/programming_guide/section-2/section-2e/05_join_L2/join_L2.py b/programming_guide/section-2/section-2e/05_join_L2/join_L2.py index e79cad4bf1..e91c4e6717 100644 --- a/programming_guide/section-2/section-2e/05_join_L2/join_L2.py +++ b/programming_guide/section-2/section-2e/05_join_L2/join_L2.py @@ -15,7 +15,7 @@ def join_L2(): with mlir_mod_ctx() as ctx: - @device(AIEDevice.npu) + @device(AIEDevice.npu1_1col) def device_body(): memRef_24_ty = T.memref(24, T.i32()) memRef_8_ty = T.memref(8, T.i32()) @@ -78,7 +78,11 @@ def core_body(): of_out2.release(ObjectFifoPort.Produce, 1) yield_([]) - print(ctx.module) + res = ctx.module.operation.verify() + if res == True: + print(ctx.module) + else: + print(res) join_L2() diff --git a/programming_guide/section-2/section-2e/05_join_L2/run.lit b/programming_guide/section-2/section-2e/05_join_L2/run.lit deleted file mode 100644 index 34c7a2f9f7..0000000000 --- a/programming_guide/section-2/section-2e/05_join_L2/run.lit +++ /dev/null @@ -1,10 +0,0 @@ -// (c) Copyright 2024 Advanced Micro Devices, Inc. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// REQUIRES: ryzen_ai -// -// RUN: %python %S/distribute_and_join_L2.py npu 0 | aie-opt -cse -canonicalize -o ./aie.mlir -// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir -// RUN: g++ %S/test.cpp -o test.exe -std=c++23 -Wall -I%S/../../../../runtime_lib/test_lib %S/../../../../runtime_lib/test_lib/test_utils.cpp %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem -// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s -// CHECK: PASS! diff --git a/programming_guide/section-2/section-2e/05_join_L2/run_makefile.lit b/programming_guide/section-2/section-2e/05_join_L2/run_makefile.lit new file mode 100644 index 0000000000..a7b1bf34a4 --- /dev/null +++ b/programming_guide/section-2/section-2e/05_join_L2/run_makefile.lit @@ -0,0 +1,9 @@ +// (c) Copyright 2024 Advanced Micro Devices, Inc. + // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + // + // REQUIRES: ryzen_ai, chess + // + // RUN: make -f %S/Makefile clean + // RUN: make -f %S/Makefile + // RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s + // CHECK: PASS! \ No newline at end of file diff --git a/programming_guide/section-3/README.md b/programming_guide/section-3/README.md index c2810b35e9..f87aa4f6c4 100644 --- a/programming_guide/section-3/README.md +++ b/programming_guide/section-3/README.md @@ -22,7 +22,7 @@ The host code can be written in either C++ (as shown in the figure) or in Python -Throughout this section, a [vector scalar multiplication](../../programming_examples/basic/vector_scalar_mul/) (c = a * factor) will be used as an example. Vector scalar multiplication takes an input vector `a` and computes the output vector `c` by multiplying each element of a with a factor. In this example, the total vector size is set to 4096 (16b) that will processed in chunks of 1024. 
+Throughout this section, a [vector scalar multiplication](../../programming_examples/basic/vector_scalar_mul/) (`c = a * factor`) will be used as an example. Vector scalar multiplication takes an input vector `a` and computes the output vector `c` by multiplying each element of `a` with a `factor`. In this example, the total vector size is set to 4096 (16b) that will be processed in chunks of 1024. This design is also available in the [programming_examples](../../programming_examples) of this repository. We will first introduce the AIE-array structural description, review the kernel code and then introduce the host code. Finally we will show how to run the design on Ryzen™ AI enabled hardware. @@ -34,7 +34,7 @@ The [aie2.py](../../programming_examples/basic/vector_scalar_mul/aie2.py) AIE-ar ```python # Device declaration - here using aie2 device NPU -@device(AIEDevice.npu) +@device(AIEDevice.npu1_1col) def device_body(): # Tile declarations @@ -58,7 +58,7 @@ Since the compute core can only access L1 memory, input data needs to be explici -This enables looking at the data movement in the AIE-array from a logical view where we deploy 3 objectFIFOs: "of_in" to bring in the vector `a`, "of_factor" to bring in the scalar factor, and "of_out" to move the output vector `c`, all using shimDMA. Note that the objects for "of_in" and "of_out" are declared to have the `memRef_ty` type: 1024 int32 elements, while the factor is an object containing a single integer. All objectFIFOs are set up using a depth size of 2 to enable the concurrent execution to the Shim Tile and Compute Tile DMAs with the processing on the compute core. +This enables looking at the data movement in the AIE-array from a logical view where we deploy 3 objectFIFOs: `of_in` to bring in the vector `a`, `of_factor` to bring in the scalar factor, and `of_out` to move the output vector `c`, all using shimDMA.
Note that the objects for `of_in` and `of_out` are declared to have the `memRef_ty` type: 1024 int32 elements, while the `factor` is an object containing a single integer. All objectFIFOs are set up using a depth size of 2 to enable the concurrent execution to the Shim Tile and Compute Tile DMAs with the processing on the compute core. ```python # AIE-array data movement with object fifos @@ -82,9 +82,9 @@ We also need to set up the data movement to/from the AIE-array: configure n-dime npu_sync(column=0, row=0, direction=0, channel=0) ``` -Finally, we need to configure how the compute core accesses the data moved to its L1 memory, in objectFIFO terminology: we need to program the acquire and release patterns of "of_in", "of_factor" and "of_out". Only a single factor is needed for the complete 4096 vector, while for every processing iteration on a sub-vector, we need to acquire an object of 1024 integers to read from "of_in" and one similar sized object from "of_out". Then we call our previously declared external function with the acquired objects as operands. After the vector scalar operation, we need to release both objects to their respective "of_in" and "of_out" objectFIFO. And finally after the 4 sub-vector iterations, we release the "of_factor" objectFIFO. +Finally, we need to configure how the compute core accesses the data moved to its L1 memory, in objectFIFO terminology: we need to program the acquire and release patterns of `of_in`, `of_factor` and `of_out`. Only a single factor is needed for the complete 4096 vector, while for every processing iteration on a sub-vector, we need to acquire an object of 1024 integers to read from `of_in` and one similar sized object from `of_out`. Then we call our previously declared external function with the acquired objects as operands. After the vector scalar operation, we need to release both objects to their respective `of_in` and `of_out` objectFIFO. 
Finally, after the 4 sub-vector iterations, we release the `of_factor` objectFIFO. -This access and execute pattern runs on the AIE compute core `ComputeTile2` and needs to get linked against the precompiled external function `"scale.o"`. We run this pattern in a very large loop to enable enqueuing multiple rounds of vector scalar multiply work from the host code. +This access and execute pattern runs on the AIE compute core `ComputeTile2` and needs to get linked against the precompiled external function `scale.o`. We run this pattern in a very large loop to enable enqueuing multiple rounds of vector scalar multiply work from the host code. ```python @core(ComputeTile2, "scale.o") @@ -123,7 +123,7 @@ Note that since the scalar factor is communicated through an object, it is provi ## Host Code -The host code acts as an environment setup and testbench for the Vector Scalar Multiplication design example. The code is responsible for loading the compiled XCLBIN file, configuring the AIE module, providing input data, and kick off the execution of the AIE design on the NPU. After running, it verifies the results and optionally outputs trace data (to be covered in [section-4c](../section-4/section-4c/). Both a C++ [test.cpp](./test.cpp) and Python [test.py](./test.py) variants of this code are available. +The host code acts as an environment setup and testbench for the Vector Scalar Multiplication design example. The code is responsible for loading the compiled XCLBIN file, configuring the AIE module, providing input data, and kicking off the execution of the AIE design on the NPU. After running, it verifies the results and optionally outputs trace data (to be covered in [section-4c](../section-4/section-4c/)). Both C++ [test.cpp](./test.cpp) and Python [test.py](./test.py) variants of this code are available.
For convenience, a set of test utilities support common elements of command line parsing, the XRT-based environment setup and testbench functionality: [test_utils.h](../../runtime_lib/test_lib/test_utils.h) or [test.py](../../python/utils/test.py). diff --git a/programming_guide/section-3/aie2.py b/programming_guide/section-3/aie2.py index 7716de709a..fc23ff6b17 100644 --- a/programming_guide/section-3/aie2.py +++ b/programming_guide/section-3/aie2.py @@ -18,7 +18,7 @@ def my_vector_scalar(): - @device(AIEDevice.npu1_4col) + @device(AIEDevice.npu1_1col) def device_body(): memRef_ty = T.memref(1024, T.i32()) diff --git a/programming_guide/section-4/section-4a/Makefile b/programming_guide/section-4/section-4a/Makefile index ad1a03468e..330041419f 100644 --- a/programming_guide/section-4/section-4a/Makefile +++ b/programming_guide/section-4/section-4a/Makefile @@ -48,7 +48,7 @@ run-10-warmup: ${targetname}.exe build/final.xclbin build/insts.txt ${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE --iters 10 --warmup 4 run_py: build/final.xclbin build/insts.txt - ${powershell} python3 test.py -x build/final.xclbin -i build/insts.txt -k MLIR_AIE + ${powershell} python3 ${srcdir}/test.py -x build/final.xclbin -i build/insts.txt -k MLIR_AIE clean: rm -rf build _build ${targetname}.exe diff --git a/programming_guide/section-4/section-4a/README.md b/programming_guide/section-4/section-4a/README.md index 507dbb375d..49d19564e1 100644 --- a/programming_guide/section-4/section-4a/README.md +++ b/programming_guide/section-4/section-4a/README.md @@ -34,7 +34,7 @@ Adding the application timer is as simple as noting a start and stop time surrou This provides us with a good baseline for how long our accelerated kernel function takes. 
## Multiple iterations -A timer for a single kernel function call is a useful starting point for understanding performance but there can be a lot of variability and overhead for a single call that is smoothed out when run multiple times. In order to benchmark the steady-state kernel run time, we can add code around our kernel call to execute multiple times and capture the minimium, maximize and average time that our kernel takes. +A timer for a single kernel function call is a useful starting point for understanding performance but there can be a lot of variability and overhead for a single call that is smoothed out when run multiple times. In order to benchmark the steady-state kernel run time, we can add code around our kernel call to execute multiple times and capture the minimum, maximum, and average time that our kernel takes. In our example [test.cpp](./test.cpp), we wrap our calls within a for loop (based on `num_iter` or number of iterations). diff --git a/programming_guide/section-4/section-4a/aie2.py b/programming_guide/section-4/section-4a/aie2.py index 427ed5e99c..a9b4b70ab5 100644 --- a/programming_guide/section-4/section-4a/aie2.py +++ b/programming_guide/section-4/section-4a/aie2.py @@ -18,7 +18,7 @@ def my_vector_scalar(): - @device(AIEDevice.npu) + @device(AIEDevice.npu1_1col) def device_body(): memRef_ty = T.memref(1024, T.i32()) diff --git a/programming_guide/section-4/section-4b/Makefile b/programming_guide/section-4/section-4b/Makefile index cc9e5cf231..b9b5a335e9 100644 --- a/programming_guide/section-4/section-4b/Makefile +++ b/programming_guide/section-4/section-4b/Makefile @@ -44,7 +44,7 @@ run: ${targetname}.exe build/final.xclbin build/insts.txt ${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE run_py: build/final.xclbin build/insts.txt - ${powershell} python3 test.py -x build/final.xclbin -i build/insts.txt -k MLIR_AIE + ${powershell} python3 ${srcdir}/test.py -x build/final.xclbin -i build/insts.txt -k MLIR_AIE
trace: ${targetname}.exe build/final.xclbin build/insts.txt ${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE -t ${trace_size} diff --git a/programming_guide/section-4/section-4b/aie2.py b/programming_guide/section-4/section-4b/aie2.py index 9a445382b7..910d4b1a94 100644 --- a/programming_guide/section-4/section-4b/aie2.py +++ b/programming_guide/section-4/section-4b/aie2.py @@ -21,7 +21,7 @@ def my_vector_scalar(): enableTrace = True trace_size = 8192 - @device(AIEDevice.npu) + @device(AIEDevice.npu1_1col) def device_body(): memRef_ty = T.memref(1024, T.i32())