Pjr docs (#1415)
Co-authored-by: pjr <pjr@xilinx.com>
Co-authored-by: Kristof Denolf <kristof.denolf@amd.com>
Co-authored-by: Joseph Melber <jgmelber@gmail.com>
4 people authored Apr 25, 2024
1 parent 3929cec commit 312a989
Showing 11 changed files with 438 additions and 434 deletions.
40 changes: 36 additions & 4 deletions programming_examples/basic/matrix_scalar_add/README.md
@@ -10,18 +10,50 @@

# <ins>Matrix Scalar Addition</ins>

A single tile performs a very simple `+` operation where the kernel loads data from local memory, increments the value by `1` and stores it back. The DMA in the Shim tile is programmed to bring the bottom left `8x16` portion of a larger `16x128` matrix into the tile to perform the operation. This reference design can be run on either a Ryzen™ AI NPU or a VCK5000.
This example shows an extremely simple single-AIE design that increments every value in an input matrix.

It demonstrates a number of features that can then be expanded into more realistic designs.

Firstly, a 2D DMA pattern is set up to access data from the input and output memories. Small `8x16` subtiles are accessed from the larger `16x128` input and output matrices. Thinking about the input and output spaces as large grids, with smaller grids of work dispatched to individual AIE cores, is a fundamental, reusable concept.
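As a rough illustration, that 2D access pattern boils down to a `sizes`/`strides` pair in the runtime sequence. The sketch below mirrors the call in `aie2.py` later in this change; `inTensor` and the size constants are the names used in that file, not standalone code.

```python
# Sketch only: carve an 8x16 subtile out of a 16x128 row-major matrix.
# The sizes/strides values mirror the runtime sequence in aie2.py below.
npu_dma_memcpy_nd(
    metadata="in0",                            # object FIFO feeding the compute tile
    bd_id=1,
    mem=inTensor,                              # host buffer holding the full 16x128 matrix
    sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH],     # 8 rows of 16 contiguous elements
    strides=[1, 1, IMAGE_WIDTH],               # step a full 128-element row between tile rows
)
```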

Secondly, the design shows how the body of work done by each AIE core is a combination of data movement (the object FIFO acquires and releases) together with compute, which in this case is expressed using a number of different MLIR dialects, such as `arith` and `memref`, alongside mlir-aie.
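In code, that combination looks roughly like the following trimmed sketch of the core body (the full version, with the same FIFO names and loop bounds, appears in `aie2.py` below):

```python
# Sketch of the core body: acquire input/output objects, compute with
# memref/arith operations, then release the objects back to the FIFOs.
@core(ComputeTile2)
def core_body():
    for _ in for_(sys.maxsize):                      # effective while(1)
        elem_in = of_in1.acquire(ObjectFifoPort.Consume, 1)
        elem_out = of_out1.acquire(ObjectFifoPort.Produce, 1)
        for i in for_(TILE_SIZE):
            v0 = memref.load(elem_in, [i])
            v1 = arith.addi(v0, arith.constant(1, T.i32()))
            memref.store(v1, elem_out, [i])
            yield_([])
        of_in1.release(ObjectFifoPort.Consume, 1)
        of_out1.release(ObjectFifoPort.Produce, 1)
        yield_([])
```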

Finally, the overall structure shows how complete designs are a combination of a static design, consisting of cores, connections, and some part of the data movement, together with a runtime sequence for controlling the design.

## Functionality

A single AIE core performs a very simple `+` operation where the kernel loads data from its local memory, increments the value by `1` and stores it back to the local memory. The DMA in the Shim tile is programmed to bring the bottom left `8x16` portion of a larger `16x128` matrix into the tile to perform the operation. This reference design can be run on either a Ryzen™ AI NPU or a VCK5000.

The kernel executes on AIE tile (`col`, 2); this is the first compute core in the column, since the Shim tile occupies row 0 and the Mem tile occupies row 1. Input data is brought to the local memory of the tile from the Shim tile (`col`, 0). The value of `col` depends on whether the application is targeting the NPU or the VCK5000.
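The corresponding tile declarations in `aie2.py` (with `col` passed on the command line) are simply:

```python
# Tile declarations: row 0 is the Shim tile, row 1 is the Mem tile (unused here),
# and row 2 is the first compute core in the column.
ShimTile = tile(int(sys.argv[2]), 0)
ComputeTile2 = tile(int(sys.argv[2]), 2)
```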


## Usage

### NPU

To compile the design and C++ testbench:

The kernel executes on AIE tile (`col`, 2). Input data is brought to the local memory of the tile from Shim tile (`col`, 0). The value of `col` depends on whether the application is targeting NPU or VCK5000. The Shim tile is programmed with a 2D DMA to bring only a 2D submatrix into the AIE tile for processing.

To compile and run the design for NPU:
```
make
make matrixAddOne
```

To run the design:

```
make run
```

To compile and run the design for VCK5000:
### VCK5000

To compile the design and C++ testbench:
```
make vck5000
```

To run the design:

```
./test.elf
```
142 changes: 70 additions & 72 deletions programming_examples/basic/matrix_scalar_add/aie2.py
@@ -19,8 +19,8 @@
IMAGE_SIZE = IMAGE_WIDTH * IMAGE_HEIGHT

# Size of the tile we are processing
TILE_WIDTH = 16
TILE_HEIGHT = 8
TILE_WIDTH = 16
TILE_SIZE = TILE_WIDTH * TILE_HEIGHT

NUM_3D = IMAGE_WIDTH / TILE_WIDTH
@@ -30,78 +30,76 @@


def my_matrix_add_one():
with mlir_mod_ctx() as ctx:

if len(sys.argv) != 3:
raise ValueError("[ERROR] Need 2 command line arguments (Device name, Col)")

if sys.argv[1] == "npu":
dev = AIEDevice.npu
elif sys.argv[1] == "xcvc1902":
dev = AIEDevice.xcvc1902
else:
raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1]))

@device(dev)
def device_body():
memRef_ty = T.memref(TILE_SIZE, T.i32())

# Tile declarations
ShimTile = tile(int(sys.argv[2]), 0)
ComputeTile2 = tile(int(sys.argv[2]), 2)

# AIE-array data movement with object fifos
# Input
of_in1 = object_fifo(
"in0", ShimTile, ComputeTile2, objfifo_capacity, memRef_ty
)

# Output
of_out1 = object_fifo(
"out0", ComputeTile2, ShimTile, objfifo_capacity, memRef_ty
)

# Set up compute tiles

# Compute tile 2
@core(ComputeTile2)
def core_body():
# Effective while(1)
for _ in for_(8):
elem_in = of_in1.acquire(ObjectFifoPort.Consume, 1)
elem_out = of_out1.acquire(ObjectFifoPort.Produce, 1)
for i in for_(TILE_SIZE):
v0 = memref.load(elem_in, [i])
v1 = arith.addi(v0, arith.constant(1, T.i32()))
memref.store(v1, elem_out, [i])
yield_([])
of_in1.release(ObjectFifoPort.Consume, 1)
of_out1.release(ObjectFifoPort.Produce, 1)
if len(sys.argv) != 3:
raise ValueError("[ERROR] Need 2 command line arguments (Device name, Col)")
if sys.argv[1] == "npu":
dev = AIEDevice.npu
elif sys.argv[1] == "xcvc1902":
dev = AIEDevice.xcvc1902
else:
raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1]))

@device(dev)
def device_body():
memRef_ty = T.memref(TILE_SIZE, T.i32())

# Tile declarations
ShimTile = tile(int(sys.argv[2]), 0)
ComputeTile2 = tile(int(sys.argv[2]), 2)

# AIE-array data movement with object fifos
# Input
of_in1 = object_fifo("in0", ShimTile, ComputeTile2, objfifo_capacity, memRef_ty)

# Output
of_out1 = object_fifo(
"out0", ComputeTile2, ShimTile, objfifo_capacity, memRef_ty
)

# Set up compute tile 2
@core(ComputeTile2)
def core_body():
# Effective while(1)
for _ in for_(sys.maxsize):
elem_in = of_in1.acquire(ObjectFifoPort.Consume, 1)
elem_out = of_out1.acquire(ObjectFifoPort.Produce, 1)
for i in for_(TILE_SIZE):
v0 = memref.load(elem_in, [i])
v1 = arith.addi(v0, arith.constant(1, T.i32()))
memref.store(v1, elem_out, [i])
yield_([])

# To/from AIE-array data movement

tensor_ty = T.memref(TILE_SIZE, T.i32())

@FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty)
def sequence(inTensor, notUsed, outTensor):
npu_dma_memcpy_nd(
metadata="out0",
bd_id=0,
mem=outTensor,
sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH],
strides=[1, 1, IMAGE_WIDTH],
)
npu_dma_memcpy_nd(
metadata="in0",
bd_id=1,
mem=inTensor,
sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH],
strides=[1, 1, IMAGE_WIDTH],
)
npu_sync(column=0, row=0, direction=0, channel=0)

print(ctx.module)
of_in1.release(ObjectFifoPort.Consume, 1)
of_out1.release(ObjectFifoPort.Produce, 1)
yield_([])

# To/from AIE-array data movement

tensor_ty = T.memref(TILE_SIZE, T.i32())

@FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty)
def sequence(inTensor, notUsed, outTensor):
npu_dma_memcpy_nd(
metadata="out0",
bd_id=0,
mem=outTensor,
sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH],
strides=[1, 1, IMAGE_WIDTH],
)
npu_dma_memcpy_nd(
metadata="in0",
bd_id=1,
mem=inTensor,
sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH],
strides=[1, 1, IMAGE_WIDTH],
)
npu_sync(column=0, row=0, direction=0, channel=0)


my_matrix_add_one()
with mlir_mod_ctx() as ctx:
my_matrix_add_one()
res = ctx.module.operation.verify()
if res == True:
print(ctx.module)
else:
print(res)
17 changes: 14 additions & 3 deletions programming_examples/basic/vector_scalar_add/README.md
@@ -10,15 +10,26 @@

# Vector Scalar Addition:

A single tile performs a very simple `+` operation where the kernel loads data from local memory, increments the value by `1`, and stores it back.
This example shows an extremely simple single-AIE design that increments every value in an input vector.

The kernel executes on AIE tile (0, 2). Input data is brought to the local memory of the tile from the Shim tile (0, 0) through the Mem tile (0, 1). The size of the input data from the Shim tile is `16xi32`. The data is stored in the Mem tile and sent to the AIE tile in smaller pieces of size `8xi32`. Output data from the AIE tile to the Shim tile follows the same process, in reverse.
It demonstrates a number of features that can then be expanded into more realistic designs.

Firstly, a simple 1D DMA pattern is set up to access data from the input and output memories. Small `64`-element subtiles are accessed from the larger `1024`-element input and output vectors. Thinking about the input and output spaces as large grids, with smaller grids of work dispatched to individual AIE cores, is a fundamental, reusable concept.

Secondly, these `64`-element subtiles, which are now in the Mem tile, are split into two smaller `32`-element subtiles and sent to the AIE engine to be processed. This shows how the multi-level memory hierarchy of the NPU can be used.
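A sketch of how that split is expressed with object FIFOs linked through the Mem tile (the FIFO names, depths, and sizes mirror the updated `aie2.py` below; this is not a standalone program):

```python
# Sketch only: 64-element objects flow Shim -> Mem tile, and are forwarded to
# the compute tile as 32-element objects through an object FIFO link.
memRef_mem_tile_ty = T.memref(MEM_TILE_WIDTH, T.i32())   # 64 x i32
memRef_aie_tile_ty = T.memref(AIE_TILE_WIDTH, T.i32())   # 32 x i32

of_in0 = object_fifo("in0", ShimTile, MemTile, 2, memRef_mem_tile_ty)
of_in1 = object_fifo("in1", MemTile, ComputeTile2, 2, memRef_aie_tile_ty)
object_fifo_link(of_in0, of_in1)   # Mem tile splits each 64-element object into 32-element pieces
```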

Thirdly, the design shows how the body of work done by each AIE core is a combination of data movement (the object FIFO acquires and releases) together with compute, which in this case is expressed using a number of different MLIR dialects, such as `arith` and `memref`, alongside `mlir-aie`.

Finally, the overall structure shows how complete designs are a combination of a static design, consisting of cores, connections, and some part of the data movement, together with a runtime sequence for controlling the design.
A single tile performs a very simple `+` operation where the kernel loads data from local memory, increments the value by `1` and stores it back.

The kernel executes on AIE tile (0, 2). Input data is first brought to the Mem tile (0, 1) using the Shim tile (0, 0). The size of the input data from the Shim tile is `64xi32`. The data is stored in the Mem tile and sent to the AIE tile in smaller pieces of size `32xi32`. Output data from the AIE tile to the Shim tile follows the same process, in reverse.

This example does not contain a C++ kernel file. The kernel is expressed in Python bindings for the `memref` and `arith` dialects, which are then compiled with the AIE compiler to generate the AIE core binary.
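Concretely, the element-wise increment is just a handful of dialect operations inside the core's loop (excerpted from `aie2.py` below; `elem_in` and `elem_out` are the acquired object FIFO elements):

```python
# Inner loop of the kernel: plain memref/arith operations, no separate C++ kernel.
for i in for_(AIE_TILE_WIDTH):
    v0 = memref.load(elem_in, [i])                    # read one i32 element
    v1 = arith.addi(v0, arith.constant(1, T.i32()))   # add the scalar 1
    memref.store(v1, elem_out, [i])                   # write the result back
    yield_([])
```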

## Source Files Overview

1. `aie2.py`: A Python script that defines the AIE array structural design using MLIR-AIE operations. This generates MLIR that is then compiled using `aiecc.py` to produce design binaries (ie. XCLBIN and inst.txt for the NPU in Ryzen™ AI).
1. `aie2.py`: defines the AIE array structural design using IRON AIE language bindings. This generates MLIR-AIE code that is then compiled using `aiecc.py` to produce design binaries (i.e., XCLBIN and inst.txt for the NPU in Ryzen™ AI).

1. `test.cpp`: This C++ code is a testbench for the design example. The code is responsible for loading the compiled XCLBIN file, configuring the AIE module, providing input data, and executing the AIE design on the NPU. After executing, the program verifies the results.

34 changes: 19 additions & 15 deletions programming_examples/basic/vector_scalar_add/aie2.py
@@ -11,13 +11,19 @@
from aie.extras.dialects.ext import memref, arith
from aie.extras.context import mlir_mod_ctx

import sys

PROBLEM_SIZE = 1024
MEM_TILE_WIDTH = 64
AIE_TILE_WIDTH = 32


def my_vector_bias_add():

@device(AIEDevice.npu)
def device_body():
memRef_16_ty = T.memref(16, T.i32())
memRef_8_ty = T.memref(8, T.i32())
memRef_mem_tile_ty = T.memref(MEM_TILE_WIDTH, T.i32())
memRef_aie_tile_ty = T.memref(AIE_TILE_WIDTH, T.i32())

# Tile declarations
ShimTile = tile(0, 0)
@@ -26,13 +26,13 @@ def device_body():

# AIE-array data movement with object fifos
# Input
of_in0 = object_fifo("in0", ShimTile, MemTile, 2, memRef_16_ty)
of_in1 = object_fifo("in1", MemTile, ComputeTile2, 2, memRef_8_ty)
of_in0 = object_fifo("in0", ShimTile, MemTile, 2, memRef_mem_tile_ty)
of_in1 = object_fifo("in1", MemTile, ComputeTile2, 2, memRef_aie_tile_ty)
object_fifo_link(of_in0, of_in1)

# Output
of_out0 = object_fifo("out0", MemTile, ShimTile, 2, memRef_16_ty)
of_out1 = object_fifo("out1", ComputeTile2, MemTile, 2, memRef_8_ty)
of_out0 = object_fifo("out0", MemTile, ShimTile, 2, memRef_mem_tile_ty)
of_out1 = object_fifo("out1", ComputeTile2, MemTile, 2, memRef_aie_tile_ty)
object_fifo_link(of_out1, of_out0)

# Set up compute tiles
@@ -41,10 +41,10 @@
@core(ComputeTile2)
def core_body():
# Effective while(1)
for _ in for_(8):
for _ in for_(sys.maxsize):
elem_in = of_in1.acquire(ObjectFifoPort.Consume, 1)
elem_out = of_out1.acquire(ObjectFifoPort.Produce, 1)
for i in for_(8):
for i in for_(AIE_TILE_WIDTH):
v0 = memref.load(elem_in, [i])
v1 = arith.addi(v0, arith.constant(1, T.i32()))
memref.store(v1, elem_out, [i])
@@ -54,17 +54,15 @@ def core_body():
yield_([])

# To/from AIE-array data movement
tensor_ty = T.memref(PROBLEM_SIZE, T.i32())

memRef_64_ty = T.memref(64, T.i32())
memRef_32_ty = T.memref(32, T.i32())

@FuncOp.from_py_func(memRef_64_ty, memRef_32_ty, memRef_64_ty)
def sequence(inTensor, notUsed, outTensor):
@FuncOp.from_py_func(tensor_ty, tensor_ty)
def sequence(inTensor, outTensor):
npu_dma_memcpy_nd(
metadata="out0", bd_id=0, mem=outTensor, sizes=[1, 1, 1, 64]
metadata="out0", bd_id=0, mem=outTensor, sizes=[1, 1, 1, PROBLEM_SIZE]
)
npu_dma_memcpy_nd(
metadata="in0", bd_id=1, mem=inTensor, sizes=[1, 1, 1, 64]
metadata="in0", bd_id=1, mem=inTensor, sizes=[1, 1, 1, PROBLEM_SIZE]
)
npu_sync(column=0, row=0, direction=0, channel=0)
