From 940f746e947e47a9beca9d081d1be75ebffaf941 Mon Sep 17 00:00:00 2001
From: Jack Lo <36210336+jackl-xilinx@users.noreply.github.com>
Date: Thu, 25 Apr 2024 22:11:37 -0700
Subject: [PATCH] Fixed matmul trace and added section to prog guide 4c (#1424)

Co-authored-by: Kristof Denolf <kristof.denolf@amd.com>
---
 .../basic/matrix_multiplication/makefile-common           | 8 ++++----
 .../basic/matrix_multiplication/single_core/aie2.py       | 4 ++--
 programming_guide/section-4/section-4c/README.md          | 4 ++++
 3 files changed, 10 insertions(+), 6 deletions(-)
diff --git a/programming_examples/basic/matrix_multiplication/makefile-common b/programming_examples/basic/matrix_multiplication/makefile-common
index 7f361e3022..dba7e6c221 100644
--- a/programming_examples/basic/matrix_multiplication/makefile-common
+++ b/programming_examples/basic/matrix_multiplication/makefile-common
@@ -39,7 +39,7 @@ M?=512
 K?=512
 N?=512
 
-trace_size=16384
+trace_size=65536
 
 mlir_target?=build/aie_${M}x${K}x${N}.mlir
 xclbin_target?=build/final_${M}x${K}x${N}.xclbin
@@ -89,11 +89,11 @@ run: ${targetname}.exe ${xclbin_target} ${insts_target} #sign
 trace: ${targetname}.exe ${xclbin_target} ${insts_target} # sign
 	export XRT_HACK_UNSECURE_LOADING_XCLBIN=1 && \
 	${powershell} ./$< -x ${xclbin_target} -i ${insts_target} -k MLIR_AIE -M $M -K $K -N $N -v 1 --warmup 0 --iters 1 -t ${trace_size}
-	../../../utils/parse_trace.py --filename trace.txt --mlir ${mlir_target} --colshift 1 > parse_trace_mm.json
+	../../../utils/parse_trace.py --filename trace.txt --mlir ${mlir_target} --colshift 1 > trace_mm.json
 
 .PHONY: parse_trace
 parse_trace:
-	../../../utils/parse_trace.py --filename trace.txt --mlir ${mlir_target} --colshift 1 > parse_trace_mm.json
+	../../../utils/parse_trace.py --filename trace.txt --mlir ${mlir_target} --colshift 1 > trace_mm.json
 
 .PHONY: clean
 clean: clean_trace
@@ -101,4 +101,4 @@ clean: clean_trace
 
 .PHONY: clean_trace
 clean_trace:
-	rm -rf tmpTrace parse*.json  trace.txt
+	rm -rf tmpTrace parse*.json trace*json trace.txt
diff --git a/programming_examples/basic/matrix_multiplication/single_core/aie2.py b/programming_examples/basic/matrix_multiplication/single_core/aie2.py
index e00534e708..54a391dc57 100644
--- a/programming_examples/basic/matrix_multiplication/single_core/aie2.py
+++ b/programming_examples/basic/matrix_multiplication/single_core/aie2.py
@@ -26,8 +26,8 @@ def my_matmul():
     word_size_out = 2
 
     vectorized = True
-    enable_tracing = False
-    trace_size = 16384
+    enable_tracing = True
+    trace_size = 65536
 
     A_sz_in_i32s = M * K * word_size_in // 4
     B_sz_in_i32s = K * N * word_size_in // 4
diff --git a/programming_guide/section-4/section-4c/README.md b/programming_guide/section-4/section-4c/README.md
index e96cd8a7b4..c65b44470e 100644
--- a/programming_guide/section-4/section-4c/README.md
+++ b/programming_guide/section-4/section-4c/README.md
@@ -206,6 +206,10 @@ Looking at this table, we quickly see that the data movement is the bottleneck f
 
     Mouse over the blocks of PortRuning0 and PortRunning1, what is the measured number of cycles per chunk? <img src="../../../mlir_tutorials/images/answer1.jpg" title="512 cycles" height=25> This matches what we expected to see. But note how it's obvious from the waveform how dominant data movement is as compared to compute. 
 
+1. We can already see that our design is imbalanced between data movement and compute, where we have 72 cycles for compute and 512 cycles for data movement. Let's take a look at the [Matrix Multiply Example](../../../programming_examples/basic/matrix_multiplication/) and see if we can do better. The description explains that each iteration of the kernel is by default configured for MxKxN values of 64x64x64, giving us 262,144 MACs. Given that we're working with the `int16_t` datatype, which has 64 MACs per clock, how many cycles will the ideal case take? <img src="../../../mlir_tutorials/images/answer1.jpg" title="2048 cycles = 262,144/ 64" height=25> Given that the A and B matrices are each 64x64 x `int16_t` and our stream switch channels are 32 bits wide, how many cycles does it take to move data to the compute tile (bear in mind A and B can be moved in parallel via separate channels)? <img src="../../../mlir_tutorials/images/answer1.jpg" title="2048 cycles = 64x64/2" height=25>
+
+1. So this example should be perfectly balanced between compute and data movement! Navigate to the [Matrix Multiply Example](../../../programming_examples/basic/matrix_multiplication/) and run the trace build (`make clean; make trace`). Then open the generated waveform JSON (`trace_mm.json`) and measure the delta between `event 0` and `event 1` in the first run. What value did you get, and how close is it to the ideal? <img src="../../../mlir_tutorials/images/answer1.jpg" title="~2535 cycles; the ideal 2048 cycles is about 80% of that" height=25> You should now see that the compute cycles and the data movement cycles are much more closely matched!
+
 ## <u>Diving Deep - Examining the Microcode</u>
 Let's take another look at the results of our [vector_scalar_mul design](../../../programming_examples/basic/vector_scalar_mul/). Let's also go back one step and comment out `chess_prepare_for_pipelining chess_loop_range(16, )` and rerun the compilation (`make clean; make trace`).