Xilinx · hunhoffe · Oct 21, 2024 · Oct 21, 2024 · Oct 21, 2024 · Oct 21, 2024
@@ -44,5 +44,9 @@ endif
 run: ${targetname}.exe build/final.xclbin
 	${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE --M ${M} --K ${K}
 
+# Plot the transpose access order (aie2.py writes transpose_data.png).
+# The target names a command, not a file, so it is declared phony.
+.PHONY: generate_access_map
+generate_access_map: ${srcdir}/aie2.py
+	python3 $< --generate-access-map ${M} ${K}
+
 clean:
 	rm -rf build _build inst ${targetname}.exe
@@ -15,11 +15,24 @@ This reference design can be run on a Ryzen™ AI NPU.
 In the [design](./aie2.py), a 2-D array in a row-major layout is read from external memory to `ComputeTile2` with a transposed layout,
 by using an implicit copy via the compute tile's Data Movement Accelerator (DMA). The data is read from and written to external memory through the Shim tile (`col`, 0).
 
+This data movement transformation can be visualized as a map that shows the order in which the data is streamed (e.g., in transposed layout):
+<p align="center">
+  <img
+    src="transpose_data.png">
+    <h3 align="center"> Visualization of the Transpose Data Transformation for M=32, K=16. 
+ </h3> 
+</p>
+
 The implicit copy is performed using the `object_fifo_link` operation that specifies how input data arriving via `of_in` should be sent further via `of_out` by specifically leveraging the compute tile's DMA. This operation and its functionality are described in more depth in [Section-2b](../../../programming_guide/section-2/section-2b/README.md/#object-fifo-link) of the programming guide.
 
 
 To compile and run the design for NPU:
-```
+```bash
 make
 make run
+```
+
+To generate a data visualization of the transpose (like that above), run:
+```bash
+make generate_access_map
 ```
@@ -5,27 +5,32 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 #
 # (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates
+import argparse
 import numpy as np
 import sys
 
 from aie.dialects.aie import *
 from aie.dialects.aiex import *
 from aie.extras.context import mlir_mod_ctx
 from aie.helpers.dialects.ext.scf import _for as range_
+from aie.helpers.tensortiler.tensortiler2d import TensorTile
 
-N = 4096
-M = 64
-K = 64
 
-if len(sys.argv) == 3:
-    M = int(sys.argv[1])
-    K = int(sys.argv[2])
-    N = M * K
+def my_passthrough(M, K, N, generate_acccess_map=False):
+    tensor_ty = np.ndarray[(M, K), np.dtype[np.int32]]
+    data_transform = TensorTile(
+        tensor_height=M,
+        tensor_width=K,
+        sizes=[1, 1, K, M],
+        strides=[1, 1, 1, K],
+        offset=0,
+    )
+    if generate_acccess_map:
+        data_transform.visualize(
+            plot_access_count=False, file_path="transpose_data.png"
+        )
+        return
 
-tensor_ty = np.ndarray[(M, K), np.dtype[np.int32]]
-
-
-def my_passthrough():
     with mlir_mod_ctx() as ctx:
 
         @device(AIEDevice.npu1_1col)
@@ -56,8 +61,7 @@ def sequence(A, B, C):
                     metadata=of_in,
                     bd_id=1,
                     mem=A,
-                    sizes=[1, 1, K, M],
-                    strides=[1, 1, 1, K],
+                    tensor_tile=data_transform,
                     issue_token=True,
                 )
                 npu_dma_memcpy_nd(metadata=of_out, bd_id=0, mem=C, sizes=[1, 1, 1, N])
@@ -66,4 +70,24 @@ def sequence(A, B, C):
     print(ctx.module)
 
 
-my_passthrough()
+if __name__ == "__main__":
+    p = argparse.ArgumentParser()
+    p.add_argument("dims", help="M K", type=int, nargs="*", default=[64, 64])
+    p.add_argument(
+        "--generate-access-map",
+        action="store_true",
+        help="Produce a file showing data access order",
+    )
+    args = p.parse_args()
+
+    if len(args.dims) != 2:
+        print(
+            "ERROR: Must provide either no dimensions or both M and K", file=sys.stderr
+        )
+        exit(-1)
+    my_passthrough(
+        M=args.dims[0],
+        K=args.dims[1],
+        N=args.dims[0] * args.dims[1],
+        generate_acccess_map=args.generate_access_map,
+    )
@@ -12,6 +12,7 @@
 from aie.dialects.aiex import *
 from aie.extras.context import mlir_mod_ctx
 from aie.helpers.dialects.ext.scf import _for as range_
+from aie.helpers.tensortiler.tensortiler2d import TensorTiler2D
 
 # Size of the entire image
 IMAGE_HEIGHT = 16
@@ -68,23 +69,24 @@ def core_body():
                 of_out1.release(ObjectFifoPort.Produce, 1)
 
         # To/from AIE-array data movement
+        tiler = TensorTiler2D(IMAGE_HEIGHT, IMAGE_WIDTH, TILE_HEIGHT, TILE_WIDTH)
+        t = next(tiler.tile_iter())  # Only transfer one (first) tile of data
+
         @runtime_sequence(tile_ty, tile_ty, tile_ty)
         def sequence(inTensor, notUsed, outTensor):
             npu_dma_memcpy_nd(
                 metadata=of_in1,
                 bd_id=1,
                 mem=inTensor,
-                sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH],
-                strides=[1, 1, IMAGE_WIDTH, 1],
+                tensor_tile=t,
                 issue_token=True,
             )
 
             npu_dma_memcpy_nd(
                 metadata=of_out1,
                 bd_id=0,
                 mem=outTensor,
-                sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH],
-                strides=[1, 1, IMAGE_WIDTH, 1],
+                tensor_tile=t,
             )
             dma_wait(of_in1, of_out1)
 

@@ -11,6 +11,7 @@
 from aie.dialects.aiex import *
 from aie.extras.context import mlir_mod_ctx
 from aie.helpers.dialects.ext.scf import _for as range_
+from aie.helpers.tensortiler.tensortiler2d import TensorTiler2D
 
 
 def row_wise_bias_add(M, N, m, n):
@@ -48,28 +49,32 @@ def core_body():
                         in_fifo.release(ObjectFifoPort.Consume, 1)
                     bias_fifo.release(ObjectFifoPort.Consume, 1)
 
+        tiler = TensorTiler2D(M, N, m, n, tensor_col_major=True)
+        t = next(
+            tiler.tile_iter(tile_group_height=M // m, tile_group_width=N // n)
+        )  # Transfer all tiles at once
+        bias_tiler = TensorTiler2D(1, N, 1, n)
+        bias_t = next(bias_tiler.tile_iter(tile_group_width=N // n))
+
         @runtime_sequence(tensor_ty, bias_ty, tensor_ty)
         def sequence(inp, bias, out):
             npu_dma_memcpy_nd(
                 metadata=in_fifo,
                 bd_id=0,
                 mem=inp,
-                sizes=[1, N // n, M, n],
-                strides=[0, n, N, 1],
+                tensor_tile=t,
             )
             npu_dma_memcpy_nd(
                 metadata=bias_fifo,
                 bd_id=1,
                 mem=bias,
-                sizes=[1, 1, N // n, n],
-                strides=[0, 0, n, 1],
+                tensor_tile=bias_t,
             )
             npu_dma_memcpy_nd(
                 metadata=out_fifo,
                 bd_id=2,
                 mem=out,
-                sizes=[1, N // n, M, n],
-                strides=[0, n, N, 1],
+                tensor_tile=t,
             )
             # of_out will only complete after of_in completes, so we just wait on of_out instead of both
             dma_wait(out_fifo)

@@ -0,0 +1,39 @@
+##===- Makefile -----------------------------------------------------------===##
+# 
+# This file licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+# 
+##===----------------------------------------------------------------------===##
+
+srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
+
+include ${srcdir}/../../../makefile-common
+
+tensor_height = 32
+tensor_width = 32
+tile_height = 4
+tile_width = 4
+data_str=${tensor_height}_${tensor_width}_${tile_height}_${tile_width}
+
+# Targets that name commands rather than files produced by this Makefile.
+.PHONY: all run clean
+
+all: build/final_${data_str}.xclbin
+
+# Generate the MLIR design. Output goes to a temporary file that is renamed on
+# success, so a failed aie2.py run never leaves a truncated .mlir that looks
+# up to date.
+build/aie_${data_str}.mlir: ${srcdir}/aie2.py
+	mkdir -p ${@D}
+	python3 $< --tensor-height ${tensor_height} --tensor-width ${tensor_width} --tile-height ${tile_height} --tile-width ${tile_width} > $@.tmp && mv -f $@.tmp $@
+
+build/final_${data_str}.xclbin: build/aie_${data_str}.mlir
+	mkdir -p ${@D}
+	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \
+		--no-xchesscc --no-xbridge \
+		--xclbin-name=${@F} --npu-insts-name=insts_${data_str}.txt $(<:%=../%)
+
+# The instruction file is written by the same aiecc.py invocation that builds
+# the xclbin (--npu-insts-name), and no rule names it as a target. Depending on
+# the xclbin alone keeps `make -j run` from failing on a missing rule.
+run: build/final_${data_str}.xclbin
+	${powershell} python3 ${srcdir}/test.py -x build/final_${data_str}.xclbin -i build/insts_${data_str}.txt -k MLIR_AIE --tensor-height ${tensor_height} --tensor-width ${tensor_width} --tile-height ${tile_height} --tile-width ${tile_width}
+
+clean:
+	rm -rf build
@@ -0,0 +1,31 @@
+<!---//===- README.md -----------------------------------------*- Markdown -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+// 
+//===----------------------------------------------------------------------===//-->
+
+# Tiling Exploration
+
+This IRON design flow example, called "Tiling Exploration", demonstrates how data may be `tiled` on input/output. This is a common data transformation pattern, and this example is meant to be interactive.
+
+## Source Files Overview
+
+TODO
+
+## Design Overview
+
+TODO
+
+## Design Component Details
+
+### AIE Array Structural Design
+
+TODO
+
+## Usage
+
+TODO
@@ -0,0 +1,88 @@
+# tiling_exploration/aie2.py -*- Python -*-
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates
+import argparse
+import numpy as np
+import sys
+
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+from aie.dialects import arith
+from aie.extras.context import mlir_mod_ctx
+from aie.helpers.dialects.ext.scf import _for as range_
+from aie.helpers.tensortiler.tensortiler2d import TensorTiler2D
+
+
+def generate_module(tensor_height, tensor_width, tile_height, tile_width):
+    @device(AIEDevice.npu1_1col)
+    def device_body():
+        # define types
+        tensor_size = tensor_height * tensor_width
+        tile_size = tile_height * tile_width
+        flattened_tensor = np.ndarray[(tensor_size,), np.dtype[TensorTiler2D.DTYPE]]
+        flattened_tile = np.ndarray[(tile_size,), np.dtype[TensorTiler2D.DTYPE]]
+
+        # Tile declarations
+        ShimTile = tile(0, 0)
+        ComputeTile2 = tile(0, 2)
+
+        # AIE-array data movement with object fifos
+        of_out = object_fifo("out", ComputeTile2, ShimTile, 2, flattened_tile)
+
+        # Set up compute tiles
+
+        # Compute tile 2
+        @core(ComputeTile2)
+        def core_body():
+            # TODO: better way to get mutable constant than buffer??
+            access_counter = buffer(
+                ComputeTile2,
+                np.ndarray[(1,), np.dtype[TensorTiler2D.DTYPE]],
+                "access_counter",
+                initial_value=np.array([0], dtype=TensorTiler2D.DTYPE),
+            )
+            for _ in range_(sys.maxsize):
+                elemOut = of_out.acquire(ObjectFifoPort.Produce, 1)
+                for i in range_(tile_size):
+                    elemOut[i] = access_counter[0]
+                    access_counter[0] += 1
+                of_out.release(ObjectFifoPort.Produce, 1)
+
+        @runtime_sequence(flattened_tensor)
+        def sequence(access_count):
+            tiler = TensorTiler2D(tensor_height, tensor_width, tile_height, tile_width)
+            for t in tiler.tile_iter():
+                npu_dma_memcpy_nd(
+                    metadata=of_out,
+                    bd_id=1,
+                    mem=access_count,
+                    tensor_tile=t,
+                )
+                dma_wait(of_out)
+
+
+def main(opts):
+    with mlir_mod_ctx() as ctx:
+        generate_module(
+            opts.tensor_height, opts.tensor_width, opts.tile_height, opts.tile_width
+        )
+        print(ctx.module)
+
+
+def get_arg_parser():
+    p = argparse.ArgumentParser()
+    p.add_argument("--tensor-height", required=True, help="Tensor height", type=int)
+    p.add_argument("--tensor-width", required=True, help="Tensor width", type=int)
+    p.add_argument("--tile-height", required=True, help="Tile height", type=int)
+    p.add_argument("--tile-width", required=True, help="Tile width", type=int)
+    return p
+
+
+if __name__ == "__main__":
+    p = get_arg_parser()
+    opts = p.parse_args()
+    main(opts)
@@ -0,0 +1,10 @@
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai, peano 
+//
+// RUN: make -f %S/Makefile clean
+// RUN: make -f %S/Makefile 
+// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s
+// CHECK: Running...
+// CHECK: PASS!