update programming_examples (#1316)
Co-authored-by: Philip James-Roxby <phil.jamesroxby@gmail.com>
Co-authored-by: pjr <pjr@xilinx.com>
Co-authored-by: Jack Lo <36210336+jackl-xilinx@users.noreply.github.com>
Co-authored-by: Kristof Denolf <kristof.denolf@amd.com>
Co-authored-by: Joseph Melber <jgmelber@gmail.com>
Co-authored-by: Andra Bisca <andra.bisca@gmail.com>
Co-authored-by: AndraBisca <andrab@amd.com>
8 people authored Apr 19, 2024
1 parent 1e4da96 commit 6354de5
Showing 19 changed files with 686 additions and 657 deletions.
61 changes: 61 additions & 0 deletions aie_kernels/aie2/mul.cc
@@ -0,0 +1,61 @@
//===- mul.cc ---------------------------------------------------*- C++ -*-===//
//
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// Copyright (C) 2023, Advanced Micro Devices, Inc.
//
//===----------------------------------------------------------------------===//

#define __AIENGINE__ 2
#define NOCPP
#define __AIEARCH__ 20

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <type_traits>

#include <aie_api/aie.hpp>

template <typename T_in, typename T_out, const int N>
void eltwise_mul(T_in *a, T_in *b, T_out *c) {
for (int i = 0; i < N; i++) {
c[i] = a[i] * b[i];
}
}

template <typename T_in, typename T_out, const int N>
void eltwise_vmul(T_in *a, T_in *b, T_out *c) {

constexpr int vec_factor = 16;
event0();
T_in *__restrict pA1 = a;
T_in *__restrict pB1 = b;
T_out *__restrict pC1 = c;
const int F = N / vec_factor;
for (int i = 0; i < F; i++)
chess_prepare_for_pipelining chess_loop_range(16, ) {
aie::vector<T_in, vec_factor> A0 = aie::load_v<vec_factor>(pA1);
pA1 += vec_factor;
aie::vector<T_in, vec_factor> B0 = aie::load_v<vec_factor>(pB1);
pB1 += vec_factor;
aie::vector<T_out, vec_factor> cout = aie::mul(A0, B0);
aie::store_v(pC1, cout);
pC1 += vec_factor;
}
event1();
}

extern "C" {

void eltwise_mul_bf16_scalar(bfloat16 *a_in, bfloat16 *b_in, bfloat16 *c_out) {
eltwise_mul<bfloat16, bfloat16, 1024>(a_in, b_in, c_out);
}

void eltwise_mul_bf16_vector(bfloat16 *a_in, bfloat16 *b_in, bfloat16 *c_out) {
eltwise_vmul<bfloat16, bfloat16, 1024>(a_in, b_in, c_out);
}

} // extern "C"
24 changes: 24 additions & 0 deletions programming_examples/basic/passthrough_dmas/README.md
@@ -0,0 +1,24 @@
<!---//===- README.md --------------------------*- Markdown -*-===//
//
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// Copyright (C) 2022, Advanced Micro Devices, Inc.
//
//===----------------------------------------------------------------------===//-->

# <ins>Passthrough DMAs</ins>

This reference design can be run on a Ryzen AI NPU.

In the [design](./aie2.py), data is brought from external memory to `ComputeTile2` and back without modification, using an implicit copy via the compute tile's Data Movement Accelerator (DMA). The data is read from and written to external memory through the Shim tile (`col`, 0).

The implicit copy is performed using the `object_fifo_link` operation, which specifies how input data arriving via `of_in` should be forwarded via `of_out` by leveraging the compute tile's DMA. This operation and its functionality are described in more depth in [Section-2b](../../../programming_guide/section-2/section-2b/README.md/#object-fifo-link) of the programming guide.
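The dataflow can be pictured with plain Python queues. This is only a conceptual analogy (all names here are hypothetical, not the mlir-aie API): two bounded FIFOs stand in for `of_in` and `of_out`, and a `link` function plays the role of the tile DMA forwarding data unmodified.

```python
from queue import Queue

# Two FIFOs standing in for of_in (shim -> compute tile) and
# of_out (compute tile -> shim); maxsize=2 mirrors a typical buffer_depth.
of_in = Queue(maxsize=2)
of_out = Queue(maxsize=2)

def link(src: Queue, dst: Queue, n: int) -> None:
    """Forward n elements unmodified, like the implicit copy that an
    object-FIFO link performs through the tile's DMA."""
    for _ in range(n):
        dst.put(src.get())

# The host pushes data in, the "link" copies it through, the host reads it back.
data = list(range(8))
out = []
for chunk in data:
    of_in.put(chunk)
    link(of_in, of_out, 1)
    out.append(of_out.get())

print(out)  # the data passes through unchanged
```

The point of the sketch is that no compute code ever touches the payload; only the data movers do.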


To compile and run the design for NPU:
```
make
make run
```
10 changes: 6 additions & 4 deletions programming_examples/basic/vector_exp/README.md
@@ -8,11 +8,11 @@
//
//===----------------------------------------------------------------------===//-->

# Eltwise exp
# Eltwise $e^x$

This example shows how the look up table capability of the AIE can be used to perform approximations to well known functions like e^x.
This design uses 4 cores, and each core operates on `1024 bfloat16` numbers. Each core contains a lookup table approximation of the e^x function, which is then used to perform the e^x operation.
e^x is typically used in machine learning applications with relatively small numbers, typically around 0..1, and also will return infinity for input values larger than 89, so a small look up table approximation method is often accurate enough compared to a more exact approximation like Taylor series expansion.
This example shows how the look up table capability of the AIE can be used to perform approximations to well known functions like $e^x$.
This design uses 4 cores, and each core operates on `1024` `bfloat16` numbers. Each core contains a lookup table approximation of the $e^x$ function, which is then used to perform the operation.
$e^x$ is typically used in machine learning applications with relatively small inputs, typically in the range 0..1, and it returns infinity for inputs larger than 89, so a small lookup-table approximation is often accurate enough compared to a more exact method such as a Taylor series expansion.
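The accuracy claim above can be checked with a plain-Python sketch of a lookup-table approximation. The table size, range, and nearest-entry indexing are assumptions for illustration; the AIE hardware LUT mechanism differs in detail, but the accuracy trade-off is the same in spirit.

```python
import math

# Build a 256-entry lookup table for e^x over [0, 1], the range where
# the approximation is claimed to be accurate enough.
TABLE_SIZE = 256
LO, HI = 0.0, 1.0
step = (HI - LO) / (TABLE_SIZE - 1)
exp_lut = [math.exp(LO + i * step) for i in range(TABLE_SIZE)]

def exp_approx(x: float) -> float:
    """Approximate e^x by nearest-entry table lookup."""
    i = round((x - LO) / step)
    i = max(0, min(TABLE_SIZE - 1, i))  # clamp to the table's range
    return exp_lut[i]

# Worst-case relative error over a fine sweep of [0, 1].
worst = max(
    abs(exp_approx(x / 1000) - math.exp(x / 1000)) / math.exp(x / 1000)
    for x in range(0, 1001)
)
print(f"worst relative error on [0, 1]: {worst:.2e}")
```

Even this tiny table keeps the relative error well under one percent on the stated input range, which is why a LUT is competitive with a series expansion here.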

## Source Files Overview

@@ -22,6 +22,8 @@

1. `test.cpp`: This C++ code is a testbench for the design example. The code is responsible for loading the compiled XCLBIN file, configuring the AIE module, providing input data, and executing the AIE design on the NPU. After executing, it verifies the results and optionally outputs trace data.

The design also uses a single file from the AIE runtime, in order to initialize the look up table contents to approximate the $e^x$ function.


## Usage

3 changes: 3 additions & 0 deletions programming_examples/basic/vector_scalar_mul/Makefile
@@ -48,6 +48,9 @@ endif
run: ${targetname}.exe build/final.xclbin build/insts.txt
${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE

run-g: ${targetname}.exe build/final.xclbin build/insts.txt
${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE -t 8192

trace:
../../utils/parse_eventIR.py --filename trace.txt --mlir build/aie.mlir --colshift 1 > parse_eventIR_vs.json

215 changes: 65 additions & 150 deletions programming_examples/basic/vector_scalar_mul/aie2.py
@@ -12,18 +12,7 @@
from aie.dialects.scf import *
from aie.extras.context import mlir_mod_ctx

# Deciphering the command line arguments
if len(sys.argv) < 3:
raise ValueError("[ERROR] Need 2 command line arguments (Device name, Col)")

if sys.argv[1] == "ipu":
dev = AIEDevice.ipu
elif sys.argv[1] == "xcvc1902":
dev = AIEDevice.xcvc1902
else:
raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1]))

col = int(sys.argv[2])
import aie.utils.trace as trace_utils


def my_vector_scalar():
@@ -38,148 +27,74 @@ def my_vector_scalar():
enable_tracing = False
trace_size = 8192

if enable_tracing and sys.argv[1] == "xcvc1902":
raise ValueError(
"[ERROR] Trace is currently not supported with device xcvc1902"
)

with mlir_mod_ctx() as ctx:

@device(dev)
def device_body():
memRef_ty = T.memref(n, T.i32())
@device(AIEDevice.ipu)
def device_body():
memRef_ty = T.memref(n, T.i32())

# AIE Core Function declarations
# AIE Core Function declarations

scale_scalar_int32 = external_func(
"scale_scalar_int32", inputs=[memRef_ty, memRef_ty]
)
scale_int32 = external_func("scale_int32", inputs=[memRef_ty, memRef_ty])
scale_scalar_int32 = external_func(
"scale_scalar_int32", inputs=[memRef_ty, memRef_ty]
)
scale_int32 = external_func("scale_int32", inputs=[memRef_ty, memRef_ty])

# Tile declarations
ShimTile = tile(0, 0)
ComputeTile2 = tile(0, 2)

# AIE-array data movement with object fifos
of_in = object_fifo("in", ShimTile, ComputeTile2, buffer_depth, memRef_ty)
of_out = object_fifo("out", ComputeTile2, ShimTile, buffer_depth, memRef_ty)

# Set up a circuit-switched flow from core to shim for tracing information
if enable_tracing:
flow(ComputeTile2, WireBundle.Trace, 0, ShimTile, WireBundle.DMA, 1)

# Set up compute tiles

# Compute tile 2
@core(ComputeTile2, "scale.o")
def core_body():
# Effective while(1)
for _ in for_(sys.maxsize):
# Number of sub-vector "tile" iterations
for _ in for_(N_div_n):
elem_out = of_out.acquire(ObjectFifoPort.Produce, 1)
elem_in = of_in.acquire(ObjectFifoPort.Consume, 1)
if vectorized:
call(scale_int32, [elem_in, elem_out])
else:
call(scale_scalar_int32, [elem_in, elem_out])
of_in.release(ObjectFifoPort.Consume, 1)
of_out.release(ObjectFifoPort.Produce, 1)
yield_([])
yield_([])

# Tile declarations
ShimTile = tile(col, 0)
compute_tile2_col, compute_tile2_row = col, 2
ComputeTile2 = tile(compute_tile2_col, compute_tile2_row)
# To/from AIE-array data movement
tensor_ty = T.memref(N, T.i32())

# AIE-array data movement with object fifos
of_in = object_fifo("in", ShimTile, ComputeTile2, buffer_depth, memRef_ty)
of_out = object_fifo("out", ComputeTile2, ShimTile, buffer_depth, memRef_ty)
@FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty)
def sequence(A, B, C):

# Set up a circuit-switched flow from core to shim for tracing information
if enable_tracing:
flow(ComputeTile2, WireBundle.Trace, 0, ShimTile, WireBundle.DMA, 1)

# Set up compute tiles

# Compute tile 2
@core(ComputeTile2, "scale.o")
def core_body():
# Effective while(1)
for _ in for_(sys.maxsize):
# Number of sub-vector "tile" iterations
for _ in for_(N_div_n):
elem_out = of_out.acquire(ObjectFifoPort.Produce, 1)
elem_in = of_in.acquire(ObjectFifoPort.Consume, 1)
if vectorized:
call(scale_int32, [elem_in, elem_out])
else:
call(scale_scalar_int32, [elem_in, elem_out])
of_in.release(ObjectFifoPort.Consume, 1)
of_out.release(ObjectFifoPort.Produce, 1)
yield_([])
yield_([])

# To/from AIE-array data movement
tensor_ty = T.memref(N, T.i32())

@FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty)
def sequence(A, B, C):

# Configure tracing, see https://github.com/Xilinx/mlir-aie/blob/resnet/docs/Tracing.md
if enable_tracing:
# 0x340D0: Trace Control 0
# 0xAABB---C
# AA <- Event to stop trace capture
# BB <- Event to start trace capture
# C <- Trace mode, 00=event-time, 01=event-PC, 10=execution
# Configure so that "Event 1" (always true) causes tracing to start
ipu_write32(
column=compute_tile2_col,
row=compute_tile2_row,
address=0x340D0,
value=0x00010000,
)
# 0x340D4: Trace Control 1
ipu_write32(
column=compute_tile2_col,
row=compute_tile2_row,
address=0x340D4,
value=0x00000000,
)
# 0x340E0: Trace Event Group 1 (Which events to trace)
# 0xAABBCCDD AA, BB, CC, DD <- four event slots
ipu_write32(
column=compute_tile2_col,
row=compute_tile2_row,
address=0x340E0,
value=0x4B222125,
)
# 0x340E4: Trace Event Group 2 (Which events to trace)
# 0xAABBCCDD AA, BB, CC, DD <- four event slots
ipu_write32(
column=compute_tile2_col,
row=compute_tile2_row,
address=0x340E4,
value=0x2D2C1A4F,
)

ipu_write32(
column=compute_tile2_col,
row=compute_tile2_row,
address=0x3FF00,
value=0x00000121,
)

# Configure a buffer descriptor to write tracing information that has been routed into this shim tile
# out to host DDR memory
trace_bd_id = 13 # use BD 13 for writing trace output from compute tile to DDR host memory
output_size = N_in_bytes
ipu_writebd_shimtile(
bd_id=trace_bd_id,
buffer_length=trace_size,
buffer_offset=output_size,
enable_packet=0,
out_of_order_id=0,
packet_id=0,
packet_type=0,
column=0,
column_num=1,
d0_size=0,
d0_stride=0,
d1_size=0,
d1_stride=0,
d2_stride=0,
ddr_id=2,
iteration_current=0,
iteration_size=0,
iteration_stride=0,
lock_acq_enable=0,
lock_acq_id=0,
lock_acq_val=0,
lock_rel_id=0,
lock_rel_val=0,
next_bd=0,
use_next_bd=0,
valid_bd=1,
)
# Set start BD to our shim bd_id (13)
ipu_write32(column=0, row=0, address=0x1D20C, value=trace_bd_id)

ipu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
ipu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
ipu_sync(column=0, row=0, direction=0, channel=0)

trace_utils.configure_simple_tracing_aie2(
ComputeTile2,
ShimTile,
channel=1,
bd_id=13,
ddr_id=2,
size=trace_size,
offset=N_in_bytes,
start=0x1,
stop=0x0,
events=[0x4B, 0x22, 0x21, 0x25, 0x2D, 0x2C, 0x1A, 0x4F],
)

ipu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
ipu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
ipu_sync(column=0, row=0, direction=0, channel=0)


with mlir_mod_ctx() as ctx:
my_vector_scalar()
print(ctx.module)


my_vector_scalar()
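The hand-written trace setup that this commit removes encodes its Trace Control 0 word (address `0x340D0`) with the `0xAABB---C` layout documented in its comments. A small sketch of that packing (helper name hypothetical; field layout taken from the comments in the removed code):

```python
def pack_trace_control0(stop_event: int, start_event: int, mode: int = 0) -> int:
    """Pack the 0xAABB---C Trace Control 0 word:
    AA = event that stops trace capture,
    BB = event that starts trace capture,
    C  = trace mode (0b00 event-time, 0b01 event-PC, 0b10 execution)."""
    assert 0 <= stop_event <= 0xFF
    assert 0 <= start_event <= 0xFF
    assert 0 <= mode <= 0b11
    return (stop_event << 24) | (start_event << 16) | mode

# "Event 1" (always true) starts tracing and nothing stops it,
# matching the 0x00010000 write in the removed code.
value = pack_trace_control0(stop_event=0x00, start_event=0x01)
print(hex(value))  # -> 0x10000
```

The replacement helper `trace_utils.configure_simple_tracing_aie2` takes `start` and `stop` events directly, so this bit packing no longer has to be done by hand in each design.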
6 changes: 3 additions & 3 deletions programming_examples/basic/vector_scalar_mul/run.lit
@@ -4,8 +4,8 @@
// REQUIRES: ryzen_ai, chess
//
// RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/scale.cc -o ./scale.o
// RUN: %python %S/aie2.py ipu 0 > ./aie.mlir
// RUN: %python aiecc.py --xchesscc --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
// RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
// RUN: %python %S/aie2.py > ./aie.mlir
// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
// RUN: g++ %S/test.cpp -o test.exe -std=c++23 -Wall -DPASSTHROUGH_SIZE=4096 -I%S/../../../runtime_lib/test_lib %S/../../../runtime_lib/test_lib/test_utils.cpp %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
// CHECK: PASS!
