Merge branch 'main' into tiler-helper

Xilinx · Oct 23, 2024 · 87df9a7 · 87df9a7
2 parents d51e5c8 + 655adb1
commit 87df9a7
Show file tree

Hide file tree

Showing 15 changed files with 41 additions and 30 deletions.
diff --git a/programming_examples/basic/matrix_multiplication/matrix_vector/run_makefile.lit b/programming_examples/basic/matrix_multiplication/matrix_vector/run_makefile.lit
@@ -3,6 +3,8 @@
 //
 // REQUIRES: ryzen_ai, peano 
 //
+// RUN: mkdir -p test
+// RUN: cd test
 // RUN: make -f %S/Makefile clean
 // RUN: make -f %S/Makefile 
 // RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s

diff --git a/programming_examples/basic/matrix_multiplication/matrix_vector/run_makefile_chess.lit b/programming_examples/basic/matrix_multiplication/matrix_vector/run_makefile_chess.lit
@@ -3,6 +3,8 @@
 //
 // REQUIRES: ryzen_ai, chess 
 //
+// RUN: mkdir -p test_chess
+// RUN: cd test_chess
 // RUN: make -f %S/Makefile.chess clean
 // RUN: make -f %S/Makefile.chess
 // RUN: %run_on_npu make -f %S/Makefile.chess run | FileCheck %s

diff --git a/programming_examples/basic/matrix_multiplication/single_core/run_makefile.lit b/programming_examples/basic/matrix_multiplication/single_core/run_makefile.lit
@@ -3,8 +3,8 @@
 //
 // REQUIRES: ryzen_ai, peano 
 //
-// RUN: mkdir -p %S/test_1
-// RUN: cd %S/test_1
+// RUN: mkdir -p test_1
+// RUN: cd test_1
 // RUN: make -f %S/Makefile clean
 // RUN: make -f %S/Makefile 
 // RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s

diff --git a/programming_examples/basic/matrix_multiplication/single_core/run_makefile_1.lit b/programming_examples/basic/matrix_multiplication/single_core/run_makefile_1.lit
@@ -3,8 +3,8 @@
 //
 // REQUIRES: ryzen_ai, peano 
 //
-// RUN: mkdir -p %S/test_2
-// RUN: cd %S/test_2
+// RUN: mkdir -p test_2
+// RUN: cd test_2
 // RUN: make -f %S/Makefile clean
 // RUN: env M=768 K=512 N=512 m=64 k=64 n=64 dtype_in=i16 dtype_out=i16 make -f %S/Makefile 
 // RUN: %run_on_npu env M=768 K=512 N=512 m=64 k=64 n=64 dtype_in=i16 dtype_out=i16 make -f %S/Makefile run | FileCheck %s

diff --git a/programming_examples/basic/matrix_multiplication/single_core/run_makefile_alt.lit b/programming_examples/basic/matrix_multiplication/single_core/run_makefile_alt.lit
@@ -3,8 +3,8 @@
 //
 // REQUIRES: ryzen_ai, peano 
 //
-// RUN: mkdir -p %S/test_alt
-// RUN: cd %S/test_alt
+// RUN: mkdir -p test_alt
+// RUN: cd test_alt
 // RUN: make -f %S/Makefile clean
 // RUN: env use_alt=1 make -f %S/Makefile 
 // RUN: %run_on_npu env use_alt=1 make -f %S/Makefile run | FileCheck %s

diff --git a/programming_examples/basic/matrix_multiplication/single_core/run_makefile_chess.lit b/programming_examples/basic/matrix_multiplication/single_core/run_makefile_chess.lit
@@ -3,6 +3,8 @@
 //
 // REQUIRES: ryzen_ai, chess 
 //
+// RUN: mkdir -p test_chess
+// RUN: cd test_chess
 // RUN: make -f %S/Makefile.chess clean
 // RUN: make -f %S/Makefile.chess
 // RUN: %run_on_npu make -f %S/Makefile.chess run | FileCheck %s

diff --git a/programming_examples/basic/matrix_multiplication/single_core/run_makefile_i8.lit b/programming_examples/basic/matrix_multiplication/single_core/run_makefile_i8.lit
@@ -3,8 +3,8 @@
 //
 // REQUIRES: ryzen_ai, peano 
 //
-// RUN: mkdir -p %S/test_i8
-// RUN: cd %S/test_i8
+// RUN: mkdir -p test_i8
+// RUN: cd test_i8
 // RUN: make -f %S/Makefile clean
 // RUN: env dtype_in=i8 dtype_out=i8 m=64 k=128 n=64 M=512 K=512 N=512 make -f %S/Makefile 
 // RUN: %run_on_npu env dtype_in=i8 dtype_out=i8 m=64 k=128 n=64 M=512 K=512 N=512 make -f %S/Makefile run | FileCheck %s

diff --git a/programming_examples/basic/matrix_multiplication/whole_array/run_makefile_1_col.lit b/programming_examples/basic/matrix_multiplication/whole_array/run_makefile_1_col.lit
@@ -3,8 +3,8 @@
 //
 // REQUIRES: ryzen_ai, peano 
 //
-// RUN: mkdir -p %S/test_1_col
-// RUN: cd %S/test_1_col
+// RUN: mkdir -p test_1_col
+// RUN: cd test_1_col
 // RUN: make -f %S/Makefile clean
 // RUN: env n_aie_cols=1 make -f %S/Makefile 
 // RUN: %run_on_npu env n_aie_cols=1 make -f %S/Makefile run | FileCheck %s

diff --git a/programming_examples/basic/matrix_multiplication/whole_array/run_makefile_2_col.lit b/programming_examples/basic/matrix_multiplication/whole_array/run_makefile_2_col.lit
@@ -3,8 +3,8 @@
 //
 // REQUIRES: ryzen_ai, peano 
 //
-// RUN: mkdir -p %S/test_2_col
-// RUN: cd %S/test_2_col
+// RUN: mkdir -p test_2_col
+// RUN: cd test_2_col
 // RUN: make -f %S/Makefile clean
 // RUN: env n_aie_cols=2 make -f %S/Makefile 
 // RUN: %run_on_npu env n_aie_cols=2 make -f %S/Makefile run | FileCheck %s

diff --git a/programming_examples/basic/matrix_multiplication/whole_array/run_makefile_4_col.lit b/programming_examples/basic/matrix_multiplication/whole_array/run_makefile_4_col.lit
@@ -3,8 +3,8 @@
 //
 // REQUIRES: ryzen_ai, peano 
 //
-// RUN: mkdir -p %S/test_4_col
-// RUN: cd %S/test_4_col
+// RUN: mkdir -p test_4_col
+// RUN: cd test_4_col
 // RUN: make -f %S/Makefile clean
 // RUN: env n_aie_cols=4 make -f %S/Makefile 
 // RUN: %run_on_npu env n_aie_cols=4 make -f %S/Makefile run | FileCheck %s

diff --git a/programming_examples/basic/matrix_multiplication/whole_array/run_makefile_4_col_i8.lit b/programming_examples/basic/matrix_multiplication/whole_array/run_makefile_4_col_i8.lit
@@ -3,8 +3,8 @@
 //
 // REQUIRES: ryzen_ai, peano 
 //
-// RUN: mkdir -p %S/test_4_col_i8
-// RUN: cd %S/test_4_col_i8
+// RUN: mkdir -p test_4_col_i8
+// RUN: cd test_4_col_i8
 // RUN: make -f %S/Makefile clean
 // RUN: env n_aie_cols=4 dtype_in=i8 dtype_out=i8 M=512 K=512 N=512 m=64 k=128 n=64 make -f %S/Makefile 
 // RUN: %run_on_npu env n_aie_cols=4 dtype_in=i8 dtype_out=i8 M=512 K=512 N=512 m=64 k=128 n=64 make -f %S/Makefile run | FileCheck %s

diff --git a/programming_examples/basic/matrix_multiplication/whole_array/run_makefile_chess.lit b/programming_examples/basic/matrix_multiplication/whole_array/run_makefile_chess.lit
@@ -3,6 +3,8 @@
 //
 // REQUIRES: ryzen_ai, chess 
 //
+// RUN: mkdir -p test_chess
+// RUN: cd test_chess
 // RUN: make -f %S/Makefile.chess clean
 // RUN: make -f %S/Makefile.chess
 // RUN: %run_on_npu make -f %S/Makefile.chess run | FileCheck %s

diff --git a/programming_examples/basic/matrix_multiplication/whole_array/run_makefile_col_maj.lit b/programming_examples/basic/matrix_multiplication/whole_array/run_makefile_col_maj.lit
@@ -3,8 +3,8 @@
 //
 // REQUIRES: ryzen_ai, peano 
 //
-// RUN: mkdir -p %S/test_b_col_maj
-// RUN: cd %S/test_b_col_maj
+// RUN: mkdir -p test_b_col_maj
+// RUN: cd test_b_col_maj
 // RUN: make -f %S/Makefile clean
 // RUN: env n_aie_cols=4 b_col_maj=1 dtype_in=bf16 dtype_out=f32 M=256 K=256 N=256 m=32 k=32 n=32 make -f %S/Makefile 
 // RUN: %run_on_npu env n_aie_cols=4 b_col_maj=1 dtype_in=bf16 dtype_out=f32 M=256 K=256 N=256 m=32 k=32 n=32 make -f %S/Makefile run | FileCheck %s

diff --git a/programming_guide/section-2/section-2b/03_Link_Distribute_Join/README.md b/programming_guide/section-2/section-2b/03_Link_Distribute_Join/README.md
@@ -21,6 +21,8 @@ class object_fifo_link(ObjectFifoLinkOp):
         self,
         fifoIns,
         fifoOuts,
+        srcOffsets=[],
+        dstOffsets=[],
     )
 ```
 A link allows the user to specify a set of input Object FIFOs via the `fifoIns` input and a set of output ones via the `fifoOuts` input. Each Object FIFO may be specified either using its `name` or its Python object. Both inputs can be either a single Object FIFO or an array of them. It is required that there exists at least one shared tile between the consumer tiles of `fifoIns` and the producer tiles of `fifoOuts` for a link to be valid. This is because the implicit copy of data will be done using the Data Movement Accelerators (DMAs) of that tile.
@@ -47,16 +49,17 @@ Currently, the Object FIFO lowering uses the order in which the output FIFOs are
 
 <img src="./../../../assets/Distribute.png" height="200">
 
-The following code snippet describes the figure above. There are three Object FIFOs: `of0` has a producer tile A and a consumer tile B, while `of1` and `of2` have B as their producer tile and C and D respectively as their consumer tiles. The link specifies that data from `of0` is distributed to `of1` and `of2`. In this link, B is the shared tile where the implicit data copy will take place via B's DMAs. We can also note how `of1` and `of2`'s datatypes are half of `of0`'s, which means that the first half of objects in `of0` will go to `of1` and the second half to `of2`, based on their order in the link.
+The following code snippet describes the figure above. There are three Object FIFOs: `of0` has a producer tile A and a consumer tile B, while `of1` and `of2` have B as their producer tile and C and D respectively as their consumer tiles. The link specifies that data from `of0` is distributed to `of1` and `of2`. In this link, B is the shared tile where the implicit data copy will take place via B's DMAs. We can also note how `of1` and `of2`'s datatypes are half the size of `of0`'s, which means that the first half of each object in `of0` will go to `of1` and the second half to `of2`, based on their order in the link. This is explicitly set by specifying the `dstOffsets` option on the link.
+
 ```python
 A = tile(1, 0)
 B = tile(1, 1)
 C = tile(1, 3)
 D = tile(2, 3)
 of0 = object_fifo("objfifo0", A, B, 2, np.ndarray[(256,), np.dtype[np.int32]])
-of1 = object_fifo("objfifo1", B, C, 2, np.ndarray[(256,), np.dtype[np.int32]])
-of2 = object_fifo("objfifo2", B, D, 2, np.ndarray[(256,), np.dtype[np.int32]])
-object_fifo_link(of0, [of1, of2])
+of1 = object_fifo("objfifo1", B, C, 2, np.ndarray[(128,), np.dtype[np.int32]])
+of2 = object_fifo("objfifo2", B, D, 2, np.ndarray[(128,), np.dtype[np.int32]])
+object_fifo_link(of0, [of1, of2], [], [0, 128])
 ```
 
 A full design example that uses this feature is available in Section 2e: [04_distribute_L2](../../section-2e/04_distribute_L2/).
@@ -76,9 +79,9 @@ B = tile(1, 1)
 C = tile(1, 3)
 D = tile(2, 3)
 of0 = object_fifo("objfifo0", B, A, 2, np.ndarray[(256,), np.dtype[np.int32]])
-of1 = object_fifo("objfifo1", C, B, 2, np.ndarray[(256,), np.dtype[np.int32]])
-of2 = object_fifo("objfifo2", D, B, 2, np.ndarray[(256,), np.dtype[np.int32]])
-object_fifo_link([of1, of2], of0)
+of1 = object_fifo("objfifo1", C, B, 2, np.ndarray[(128,), np.dtype[np.int32]])
+of2 = object_fifo("objfifo2", D, B, 2, np.ndarray[(128,), np.dtype[np.int32]])
+object_fifo_link([of1, of2], of0, [0, 128], [])
 ```
 
 A full design example that uses these features is available in Section 2e: [05_join_L2](../../section-2e/05_join_L2/).

diff --git a/programming_guide/section-2/section-2f/README.md b/programming_guide/section-2/section-2f/README.md
@@ -55,7 +55,7 @@ tile_a = tile(1, 3)
 
 prod_lock = lock(tile_a, lock_id=0, init=1)
 cons_lock = lock(tile_a, lock_id=1, init=0)
-buff_in = buffer(tile=tile_a, shape=(256,), dtype=np.int32) # 256xi32
+buff_in = buffer(tile=tile_a, datatype=np.ndarray[(256,), np.dtype[np.int32]]) # 256xi32
 
 @mem(tile_a)
 def mem_body():
@@ -78,8 +78,8 @@ tile_a = tile(1, 3)
 
 prod_lock = lock(tile_a, lock_id=0, init=2) # note that the producer lock now has 2 tokens
 cons_lock = lock(tile_a, lock_id=1, init=0)
-buff_ping = buffer(tile=tile_a, shape=(256,), dtype=np.int32) # 256xi32
-buff_pong = buffer(tile=tile_a, shape=(256,), dtype=np.int32) # 256xi32
+buff_ping = buffer(tile=tile_a, datatype=np.ndarray[(256,), np.dtype[np.int32]]) # 256xi32
+buff_pong = buffer(tile=tile_a, datatype=np.ndarray[(256,), np.dtype[np.int32]]) # 256xi32
 
 @mem(tile_a)
 def mem_body():
@@ -130,11 +130,11 @@ tile_b = tile(1, 3)
 
 prod_lock_a = lock(tile_a, lock_id=0, init=1)
 cons_lock_a = lock(tile_a, lock_id=1, init=0)
-buff_a = buffer(tile=tile_a, shape=(256,), dtype=np.int32) # 256xi32
+buff_a = buffer(tile=tile_a, datatype=np.ndarray[(256,), np.dtype[np.int32]]) # 256xi32
 
 prod_lock_b = lock(tile_b, lock_id=0, init=1)
 cons_lock_b = lock(tile_b, lock_id=1, init=0)
-buff_b = buffer(tile=tile_b, shape=(256,), dtype=np.int32) # 256xi32
+buff_b = buffer(tile=tile_b, datatype=np.ndarray[(256,), np.dtype[np.int32]]) # 256xi32
 
 aie.flow(tile_a, WireBundle.DMA, 0, tile_b, WireBundle.DMA, 1)