From 91eea4acda595a15b7c8813ea5eace5d94718436 Mon Sep 17 00:00:00 2001
From: Harsh Menon <harsh@nod-labs.com>
Date: Fri, 16 Aug 2024 13:37:15 -0700
Subject: [PATCH 1/2] Add lowerings for mma, register and allocate

This PR adds a mma unit test which lowers to
vector.loads/stores and amdgpu.mfmas. Also supports
shared memory promotion.

Signed-off-by: Harsh Menon <harsh@nod-labs.com>
---
 lit_tests/kernel/wave/codegen.py              |  98 +++++++++++++++++
 lit_tests/kernel/wave/expansion.py            |  20 ++--
 lit_tests/kernel/wave/promotion.py            |  14 +--
 shark_turbine/kernel/compiler/ir.py           |   4 +-
 .../kernel/compiler/vector_codegen.py         |  11 +-
 shark_turbine/kernel/lang/wave_types.py       |   3 +-
 shark_turbine/kernel/ops/wave_ops.py          |  28 +++--
 shark_turbine/kernel/wave/codegen.py          | 100 ++++++++++++++++--
 shark_turbine/kernel/wave/constraints.py      |  16 +++
 shark_turbine/kernel/wave/expansion.py        |   8 +-
 shark_turbine/kernel/wave/promotion.py        |  42 +++++---
 .../kernel/wave/register_analysis.py          |  32 ++++++
 shark_turbine/kernel/wave/utils.py            |  35 ++++++
 shark_turbine/kernel/wave/wave.py             |  12 ++-
 tests/kernel/wave/wave_gemm_test.py           |   3 +-
 15 files changed, 370 insertions(+), 56 deletions(-)
 create mode 100644 shark_turbine/kernel/wave/register_analysis.py
 create mode 100644 shark_turbine/kernel/wave/utils.py

diff --git a/lit_tests/kernel/wave/codegen.py b/lit_tests/kernel/wave/codegen.py
index 548eba96..6c80e55b 100644
--- a/lit_tests/kernel/wave/codegen.py
+++ b/lit_tests/kernel/wave/codegen.py
@@ -5,6 +5,7 @@
 import shark_turbine.kernel as tk
 import shark_turbine.kernel.lang as tkl
 import shark_turbine.kernel.wave as tkw
+from shark_turbine.kernel.lang.global_symbols import *
 import torch
 
 M = tkl.sym.M
@@ -13,7 +14,10 @@
 BLOCK_M = tkl.sym.BLOCK_M
 BLOCK_N = tkl.sym.BLOCK_N
 BLOCK_K = tkl.sym.BLOCK_K
+LOAD_ELEMS_PER_THREAD = tkl.sym.LOAD_ELEM_PER_THREAD
+STORE_ELEMS_PER_THREAD = tkl.sym.STORE_ELEM_PER_THREAD
 ADDRESS_SPACE = tkl.sym.ADDRESS_SPACE
+ADDRESS_SPACE_0 = tkl.sym.ADDRESS_SPACE_0
 
 
 def launch(func: Callable[[], None]) -> Callable[[], None]:
@@ -247,6 +251,100 @@ def test(
         # CHECK: vector.scatter %[[OUT]][%[[IDX_Y]], %[[IDX_X]]] [%[[OFF]]], %[[MASK]], %[[RES]] : memref<16x16xf16, strided<[16, 1], offset: ?>>, vector<16xindex>, vector<16xi1>, vector<16xf16>
 
 
+@run
+def test_mma():
+    constraints: list[tkw.Constraint] = [tkw.WorkgroupConstraint(M, BLOCK_M, 0)]
+    constraints += [tkw.WorkgroupConstraint(N, BLOCK_N, 1)]
+    constraints += [tkw.WaveConstraint(M, BLOCK_M / 2)]
+    constraints += [tkw.WaveConstraint(N, BLOCK_N / 2)]
+
+    constraints += [
+        tkw.HardwareConstraint(
+            threads_per_wave=64,
+            waves_per_block=(2, 2, 1),
+            mma_type=tkw.MMAType.F32_16x16x16_F16,
+        )
+    ]
+
+    @tkw.wave(constraints)
+    def mma(
+        a: tkl.Memory[M, K, ADDRESS_SPACE, tkl.f16],
+        b: tkl.Memory[N, K, ADDRESS_SPACE, tkl.f16],
+        c: tkl.Memory[M, N, ADDRESS_SPACE_0, tkl.f32],
+    ):
+        c_reg = tkl.Register[M, N, tkl.f32](0.0)
+        a_reg = tkw.read(a, elements_per_thread=LOAD_ELEMS_PER_THREAD)
+        b_reg = tkw.read(b, elements_per_thread=LOAD_ELEMS_PER_THREAD)
+        acc = tkw.mma(a_reg, b_reg, c_reg)
+        tkw.write(acc, c, elements_per_thread=STORE_ELEMS_PER_THREAD)
+
+    with tk.gen.TestLaunchContext(
+        {
+            M: 64,
+            N: 128,
+            K: 16,
+            BLOCK_M: 32,
+            BLOCK_N: 32,
+            LOAD_ELEMS_PER_THREAD: 4,
+            STORE_ELEMS_PER_THREAD: 4,
+            ADDRESS_SPACE: SHARED_ADDRESS_SPACE,
+            ADDRESS_SPACE_0: GLOBAL_ADDRESS_SPACE,
+        }
+    ):
+        a = torch.randn(64, 32, dtype=torch.float16)
+        b = torch.randn(128, 32, dtype=torch.float16)
+        c = torch.zeros(64, 128, dtype=torch.float32)
+        print(mma(a, b, c, canonicalize=True).module_op)
+
+        # CHECK: func.func @mma(%[[ARG0:.+]]: !stream.binding, %[[ARG1:.+]]: !stream.binding, %[[ARG2:.+]]: !stream.binding) {
+        # CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
+        # CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
+        # CHECK-DAG: %[[C32:.+]] = arith.constant 32 : index
+        # CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+        # CHECK-DAG: %[[ACC:.+]] = arith.constant dense<0.000000e+00> : vector<4xf32>
+        # CHECK: %[[WG0:.+]] = stream.dispatch.workgroup.id[0] : index
+        # CHECK: %[[WG1:.+]] = stream.dispatch.workgroup.id[1] : index
+        # CHECK: %[[TX:.+]] = gpu.thread_id  x
+        # CHECK: %[[TY:.+]] = gpu.thread_id  y
+        # CHECK: %[[R0:.+]] = stream.binding.subspan %[[ARG0]][%[[C0]]] : !stream.binding -> memref<64x16xf16, strided<[16, 1], offset: ?>>
+        # CHECK: %[[R1:.+]] = arith.muli %[[WG0]], %[[C32]] : index
+        # CHECK: %[[R2:.+]] = arith.divsi %[[TX]], %[[C4]] : index
+        # CHECK: %[[R3:.+]] = arith.addi %[[R2]], %[[R1]] : index
+        # CHECK: %[[R4:.+]] = vector.load %0[%[[R3]], %[[C0]]] : memref<64x16xf16, strided<[16, 1], offset: ?>>, vector<4xf16>
+        # CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<32x16xf16, #gpu.address_space<workgroup>>
+        # CHECK: %[[R5:.+]] = arith.muli %[[WG0]], %[[C32]] : index
+        # CHECK: %[[R6:.+]] = arith.divsi %[[TX]], %[[C4]] : index
+        # CHECK: %[[R7:.+]] = arith.addi %[[R6]], %[[R5]] : index
+        # CHECK: vector.store %4, %[[ALLOC]][%[[R7]], %[[C0]]] : memref<32x16xf16, #gpu.address_space<workgroup>>, vector<4xf16>
+        # CHECK: %[[R8:.+]] = arith.muli %[[WG0]], %[[C32]] : index
+        # CHECK: %[[R9:.+]] = arith.divsi %[[TX]], %[[C4]] : index
+        # CHECK: %[[R10:.+]] = arith.addi %[[R9]], %[[R8]] : index
+        # CHECK: %[[R11:.+]] = vector.load %[[ALLOC]][%[[R10]], %[[C0]]] : memref<32x16xf16, #gpu.address_space<workgroup>>, vector<4xf16>
+        # CHECK: %[[R12:.+]] = stream.binding.subspan %[[ARG1]][%[[C0]]] : !stream.binding -> memref<128x16xf16, strided<[16, 1], offset: ?>>
+        # CHECK: %[[R13:.+]] = arith.muli %[[TY]], %[[C16]] : index
+        # CHECK: %[[R14:.+]] = arith.muli %[[WG1]], %[[C32]] : index
+        # CHECK: %[[R15:.+]] = arith.addi %[[R14]], %[[R13]] : index
+        # CHECK: %[[R16:.+]] = vector.load %[[R12]][%[[R15]], %[[C0]]] : memref<128x16xf16, strided<[16, 1], offset: ?>>, vector<4xf16>
+        # CHECK: %[[ALLOC_0:.+]] = memref.alloc() : memref<32x16xf16, #gpu.address_space<workgroup>>
+        # CHECK: %[[R17:.+]] = arith.muli %[[TY]], %[[C16]] : index
+        # CHECK: %[[R18:.+]] = arith.muli %[[WG1]], %[[C32]] : index
+        # CHECK: %[[R19:.+]] = arith.addi %[[R18]], %[[R17]] : index
+        # CHECK: vector.store %16, %[[ALLOC_0]][%[[R19]], %[[C0]]] : memref<32x16xf16, #gpu.address_space<workgroup>>, vector<4xf16>
+        # CHECK: %[[R20:.+]] = arith.muli %[[TY]], %[[C16]] : index
+        # CHECK: %[[R21:.+]] = arith.muli %[[WG1]], %[[C32]] : index
+        # CHECK: %[[R22:.+]] = arith.addi %[[R21]], %[[R20]] : index
+        # CHECK: %[[R23:.+]] = vector.load %[[ALLOC_0]][%[[R22]], %[[C0]]] : memref<32x16xf16, #gpu.address_space<workgroup>>, vector<4xf16>
+        # CHECK: %[[R24:.+]] = amdgpu.mfma %[[R11]] * %[[R23]] + %[[ACC]] {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp =  none : vector<4xf16>, vector<4xf16>, vector<4xf32>
+        # CHECK: %[[R25:.+]] = stream.binding.subspan %[[ARG2]][%[[C0]]] : !stream.binding -> memref<64x128xf32, strided<[128, 1], offset: ?>>
+        # CHECK: %[[R26:.+]] = arith.muli %[[WG0]], %[[C32]] : index
+        # CHECK: %[[R27:.+]] = arith.divsi %[[TX]], %[[C4]] : index
+        # CHECK: %[[R28:.+]] = arith.addi %[[R27]], %[[R26]] : index
+        # CHECK: %[[R29:.+]] = arith.muli %[[TY]], %[[C16]] : index
+        # CHECK: %[[R30:.+]] = arith.muli %[[WG1]], %[[C32]] : index
+        # CHECK: %[[R31:.+]] = arith.addi %[[R30]], %[[R29]] : index
+        # CHECK: vector.store %[[R24]], %[[R25]][%[[R28]], %[[R31]]] : memref<64x128xf32, strided<[128, 1], offset: ?>>, vector<4xf32>
+
+
 @run
 def test_add_float():
     constraints: list[tkw.Constraint] = [
diff --git a/lit_tests/kernel/wave/expansion.py b/lit_tests/kernel/wave/expansion.py
index a2a5429c..c8651f7c 100644
--- a/lit_tests/kernel/wave/expansion.py
+++ b/lit_tests/kernel/wave/expansion.py
@@ -350,25 +350,25 @@ def test_gemm():
         # CHECK-SAME: acc=acc_0_0_0 (index = [$T0*BLOCK_M/128 + 16*$T1 + 16*$T2 + $WG0*BLOCK_M + 4*floor($T0/16) : 4 : 16, $T1*BLOCK_N/2 + $WG1*BLOCK_N + Mod($T0, 16)]))
         # CHECK-NEXT: mma(lhs=read_0_0_1 (index = [$T0*BLOCK_M/128 + $WG0*BLOCK_M + Mod($T0, 16), 16*$T1 + 16*$T2 + ARGK*BLOCK_K + 4*floor($T0/16) + 16 : 4 : 1])
         # CHECK-SAME: rhs=read_0_0_1 (index = [$T1*BLOCK_N/2 + $WG1*BLOCK_N + Mod($T0, 16), 16*$T1 + 16*$T2 + ARGK*BLOCK_K + 4*floor($T0/16) + 16 : 4 : 1])
-        # CHECK-SAME: acc=mma_0_0_0 (index = None))
+        # CHECK-SAME: acc=mma_0_0_0 (index = [$T0*BLOCK_M/128 + 16*$T1 + 16*$T2 + $WG0*BLOCK_M + 4*floor($T0/16) : 4 : 16, $T1*BLOCK_N/2 + $WG1*BLOCK_N + Mod($T0, 16)]))
         # CHECK-NEXT: mma(lhs=read_1_0_0 (index = [$T0*BLOCK_M/128 + $WG0*BLOCK_M + Mod($T0, 16) + 16, 16*$T1 + 16*$T2 + ARGK*BLOCK_K + 4*floor($T0/16) : 4 : 1])
         # CHECK-SAME: rhs=read_0_1_0 (index = [$T1*BLOCK_N/2 + $WG1*BLOCK_N + Mod($T0, 16) + 16, 16*$T1 + 16*$T2 + ARGK*BLOCK_K + 4*floor($T0/16) : 4 : 1])
         # CHECK-SAME: acc=acc_1_1_0 (index = [$T0*BLOCK_M/128 + 16*$T1 + 16*$T2 + $WG0*BLOCK_M + 4*floor($T0/16) + 16 : 4 : 16, $T1*BLOCK_N/2 + $WG1*BLOCK_N + Mod($T0, 16) + 16]))
         # CHECK-NEXT: mma(lhs=read_1_0_1 (index = [$T0*BLOCK_M/128 + $WG0*BLOCK_M + Mod($T0, 16) + 16, 16*$T1 + 16*$T2 + ARGK*BLOCK_K + 4*floor($T0/16) + 16 : 4 : 1])
         # CHECK-SAME: rhs=read_0_1_1 (index = [$T1*BLOCK_N/2 + $WG1*BLOCK_N + Mod($T0, 16) + 16, 16*$T1 + 16*$T2 + ARGK*BLOCK_K + 4*floor($T0/16) + 16 : 4 : 1])
-        # CHECK-SAME: acc=mma_1_1_0 (index = None))
+        # CHECK-SAME: acc=mma_1_1_0 (index = [$T0*BLOCK_M/128 + 16*$T1 + 16*$T2 + $WG0*BLOCK_M + 4*floor($T0/16) + 16 : 4 : 16, $T1*BLOCK_N/2 + $WG1*BLOCK_N + Mod($T0, 16) + 16]))
         # CHECK-NEXT: mma(lhs=read_1_0_0 (index = [$T0*BLOCK_M/128 + $WG0*BLOCK_M + Mod($T0, 16) + 16, 16*$T1 + 16*$T2 + ARGK*BLOCK_K + 4*floor($T0/16) : 4 : 1])
         # CHECK-SAME: rhs=read_0_0_0 (index = [$T1*BLOCK_N/2 + $WG1*BLOCK_N + Mod($T0, 16), 16*$T1 + 16*$T2 + ARGK*BLOCK_K + 4*floor($T0/16) : 4 : 1])
         # CHECK-SAME: acc=acc_1_0_0 (index = [$T0*BLOCK_M/128 + 16*$T1 + 16*$T2 + $WG0*BLOCK_M + 4*floor($T0/16) + 16 : 4 : 16, $T1*BLOCK_N/2 + $WG1*BLOCK_N + Mod($T0, 16)]))
         # CHECK-NEXT: mma(lhs=read_1_0_1 (index = [$T0*BLOCK_M/128 + $WG0*BLOCK_M + Mod($T0, 16) + 16, 16*$T1 + 16*$T2 + ARGK*BLOCK_K + 4*floor($T0/16) + 16 : 4 : 1])
         # CHECK-SAME: rhs=read_0_0_1 (index = [$T1*BLOCK_N/2 + $WG1*BLOCK_N + Mod($T0, 16), 16*$T1 + 16*$T2 + ARGK*BLOCK_K + 4*floor($T0/16) + 16 : 4 : 1])
-        # CHECK-SAME: acc=mma_1_0_0 (index = None))
+        # CHECK-SAME: acc=mma_1_0_0 (index = [$T0*BLOCK_M/128 + 16*$T1 + 16*$T2 + $WG0*BLOCK_M + 4*floor($T0/16) + 16 : 4 : 16, $T1*BLOCK_N/2 + $WG1*BLOCK_N + Mod($T0, 16)]))
         # CHECK-NEXT: mma(lhs=read_0_0_0 (index = [$T0*BLOCK_M/128 + $WG0*BLOCK_M + Mod($T0, 16), 16*$T1 + 16*$T2 + ARGK*BLOCK_K + 4*floor($T0/16) : 4 : 1])
         # CHECK-SAME: rhs=read_0_1_0 (index = [$T1*BLOCK_N/2 + $WG1*BLOCK_N + Mod($T0, 16) + 16, 16*$T1 + 16*$T2 + ARGK*BLOCK_K + 4*floor($T0/16) : 4 : 1])
         # CHECK-SAME: acc=acc_0_1_0 (index = [$T0*BLOCK_M/128 + 16*$T1 + 16*$T2 + $WG0*BLOCK_M + 4*floor($T0/16) : 4 : 16, $T1*BLOCK_N/2 + $WG1*BLOCK_N + Mod($T0, 16) + 16]))
         # CHECK-NEXT: mma(lhs=read_0_0_1 (index = [$T0*BLOCK_M/128 + $WG0*BLOCK_M + Mod($T0, 16), 16*$T1 + 16*$T2 + ARGK*BLOCK_K + 4*floor($T0/16) + 16 : 4 : 1])
         # CHECK-SAME: rhs=read_0_1_1 (index = [$T1*BLOCK_N/2 + $WG1*BLOCK_N + Mod($T0, 16) + 16, 16*$T1 + 16*$T2 + ARGK*BLOCK_K + 4*floor($T0/16) + 16 : 4 : 1])
-        # CHECK-SAME: acc=mma_0_1_0 (index = None))
+        # CHECK-SAME: acc=mma_0_1_0 (index = [$T0*BLOCK_M/128 + 16*$T1 + 16*$T2 + $WG0*BLOCK_M + 4*floor($T0/16) : 4 : 16, $T1*BLOCK_N/2 + $WG1*BLOCK_N + Mod($T0, 16) + 16]))
         # CHECK-NEXT: output(return_vals=([mma_0_0_1, mma_1_1_1, mma_1_0_1, mma_0_1_1],))
 
         # CHECK-NEXT: -----
@@ -502,25 +502,25 @@ def test_gemm_reduction_expansion_only():
         # CHECK-SAME: acc=acc_0_0_0 (index = [$T0*BLOCK_M/128 + 16*$T1 + 16*$T2 + $WG0*BLOCK_M + 4*floor($T0/16) : 4 : 16, $T1*BLOCK_N/2 + $WG1*BLOCK_N + Mod($T0, 16)]))
         # CHECK-NEXT: mma(lhs=read_0_0_1 (index = [$T0*BLOCK_M/128 + $WG0*BLOCK_M + Mod($T0, 16), 16*$T1 + 16*$T2 + ARGK*BLOCK_K + 4*floor($T0/16) + 16 : 4 : 1])
         # CHECK-SAME: rhs=read_0_0_1 (index = [$T1*BLOCK_N/2 + $WG1*BLOCK_N + Mod($T0, 16), 16*$T1 + 16*$T2 + ARGK*BLOCK_K + 4*floor($T0/16) + 16 : 4 : 1])
-        # CHECK-SAME: acc=mma_0_0_0 (index = None))
+        # CHECK-SAME: acc=mma_0_0_0 (index = [$T0*BLOCK_M/128 + 16*$T1 + 16*$T2 + $WG0*BLOCK_M + 4*floor($T0/16) : 4 : 16, $T1*BLOCK_N/2 + $WG1*BLOCK_N + Mod($T0, 16)]))
         # CHECK-NEXT: mma(lhs=read_0_0_2 (index = [$T0*BLOCK_M/128 + $WG0*BLOCK_M + Mod($T0, 16), 16*$T1 + 16*$T2 + ARGK*BLOCK_K + 4*floor($T0/16) + 32 : 4 : 1])
         # CHECK-SAME: rhs=read_0_0_2 (index = [$T1*BLOCK_N/2 + $WG1*BLOCK_N + Mod($T0, 16), 16*$T1 + 16*$T2 + ARGK*BLOCK_K + 4*floor($T0/16) + 32 : 4 : 1])
-        # CHECK-SAME: acc=mma_0_0_1 (index = None))
+        # CHECK-SAME: acc=mma_0_0_1 (index = [$T0*BLOCK_M/128 + 16*$T1 + 16*$T2 + $WG0*BLOCK_M + 4*floor($T0/16) : 4 : 16, $T1*BLOCK_N/2 + $WG1*BLOCK_N + Mod($T0, 16)]))
         # CHECK-NEXT: mma(lhs=read_0_0_3 (index = [$T0*BLOCK_M/128 + $WG0*BLOCK_M + Mod($T0, 16), 16*$T1 + 16*$T2 + ARGK*BLOCK_K + 4*floor($T0/16) + 48 : 4 : 1])
         # CHECK-SAME: rhs=read_0_0_3 (index = [$T1*BLOCK_N/2 + $WG1*BLOCK_N + Mod($T0, 16), 16*$T1 + 16*$T2 + ARGK*BLOCK_K + 4*floor($T0/16) + 48 : 4 : 1])
-        # CHECK-SAME: acc=mma_0_0_2 (index = None))
+        # CHECK-SAME: acc=mma_0_0_2 (index = [$T0*BLOCK_M/128 + 16*$T1 + 16*$T2 + $WG0*BLOCK_M + 4*floor($T0/16) : 4 : 16, $T1*BLOCK_N/2 + $WG1*BLOCK_N + Mod($T0, 16)]))
         # CHECK-NEXT: mma(lhs=read_0_0_0 (index = [$T0*BLOCK_M/128 + $WG0*BLOCK_M + Mod($T0, 16), 16*$T1 + 16*$T2 + ARGK*BLOCK_K + 4*floor($T0/16) : 4 : 1])
         # CHECK-SAME: rhs=read_0_1_0 (index = [$T1*BLOCK_N/2 + $WG1*BLOCK_N + Mod($T0, 16) + 16, 16*$T1 + 16*$T2 + ARGK*BLOCK_K + 4*floor($T0/16) : 4 : 1])
         # CHECK-SAME: acc=acc_0_1_0 (index = [$T0*BLOCK_M/128 + 16*$T1 + 16*$T2 + $WG0*BLOCK_M + 4*floor($T0/16) : 4 : 16, $T1*BLOCK_N/2 + $WG1*BLOCK_N + Mod($T0, 16) + 16]))
         # CHECK-NEXT: mma(lhs=read_0_0_1 (index = [$T0*BLOCK_M/128 + $WG0*BLOCK_M + Mod($T0, 16), 16*$T1 + 16*$T2 + ARGK*BLOCK_K + 4*floor($T0/16) + 16 : 4 : 1])
         # CHECK-SAME: rhs=read_0_1_1 (index = [$T1*BLOCK_N/2 + $WG1*BLOCK_N + Mod($T0, 16) + 16, 16*$T1 + 16*$T2 + ARGK*BLOCK_K + 4*floor($T0/16) + 16 : 4 : 1])
-        # CHECK-SAME: acc=mma_0_1_0 (index = None))
+        # CHECK-SAME: acc=mma_0_1_0 (index = [$T0*BLOCK_M/128 + 16*$T1 + 16*$T2 + $WG0*BLOCK_M + 4*floor($T0/16) : 4 : 16, $T1*BLOCK_N/2 + $WG1*BLOCK_N + Mod($T0, 16) + 16]))
         # CHECK-NEXT: mma(lhs=read_0_0_2 (index = [$T0*BLOCK_M/128 + $WG0*BLOCK_M + Mod($T0, 16), 16*$T1 + 16*$T2 + ARGK*BLOCK_K + 4*floor($T0/16) + 32 : 4 : 1])
         # CHECK-SAME: rhs=read_0_1_2 (index = [$T1*BLOCK_N/2 + $WG1*BLOCK_N + Mod($T0, 16) + 16, 16*$T1 + 16*$T2 + ARGK*BLOCK_K + 4*floor($T0/16) + 32 : 4 : 1])
-        # CHECK-SAME: acc=mma_0_1_1 (index = None))
+        # CHECK-SAME: acc=mma_0_1_1 (index = [$T0*BLOCK_M/128 + 16*$T1 + 16*$T2 + $WG0*BLOCK_M + 4*floor($T0/16) : 4 : 16, $T1*BLOCK_N/2 + $WG1*BLOCK_N + Mod($T0, 16) + 16]))
         # CHECK-NEXT: mma(lhs=read_0_0_3 (index = [$T0*BLOCK_M/128 + $WG0*BLOCK_M + Mod($T0, 16), 16*$T1 + 16*$T2 + ARGK*BLOCK_K + 4*floor($T0/16) + 48 : 4 : 1])
         # CHECK-SAME: rhs=read_0_1_3 (index = [$T1*BLOCK_N/2 + $WG1*BLOCK_N + Mod($T0, 16) + 16, 16*$T1 + 16*$T2 + ARGK*BLOCK_K + 4*floor($T0/16) + 48 : 4 : 1])
-        # CHECK-SAME: acc=mma_0_1_2 (index = None))
+        # CHECK-SAME: acc=mma_0_1_2 (index = [$T0*BLOCK_M/128 + 16*$T1 + 16*$T2 + $WG0*BLOCK_M + 4*floor($T0/16) : 4 : 16, $T1*BLOCK_N/2 + $WG1*BLOCK_N + Mod($T0, 16) + 16]))
         # CHECK-NEXT: output(return_vals=([mma_0_0_3, mma_0_1_3],))
 
         # CHECK-NEXT: -----
diff --git a/lit_tests/kernel/wave/promotion.py b/lit_tests/kernel/wave/promotion.py
index b46f912c..34b96687 100644
--- a/lit_tests/kernel/wave/promotion.py
+++ b/lit_tests/kernel/wave/promotion.py
@@ -87,14 +87,14 @@ def test_read_write_equal_sizes():
         graph: fx.Graph = trace.get_root_graph()
         read_node = get_read_nodes(graph)[0]
         IndexingContext.current().finalize()
-        promote_node(read_node, SHARED_ADDRESS_SPACE)
+        promote_node(read_node, SHARED_ADDRESS_SPACE, constraints)
         print_trace(trace)
         # CHECK: %a
         # CHECK-NEXT: %c
         # CHECK-NEXT: %read
         # CHECK-SAME: (%a, 4, None)
         # CHECK-NEXT: %allocate
-        # CHECK-SAME: ((M, N), f16, $SHARED_ADDRESS_SPACE)
+        # CHECK-SAME: ((M, N), (BLOCK_M, BLOCK_N), f16, $SHARED_ADDRESS_SPACE)
         # CHECK-NEXT: %write_1
         # CHECK-SAME: (%read, %allocate, 4, None)
         # CHECK-NEXT: %read_1
@@ -136,14 +136,14 @@ def test_read_write_equal_sizes_different_address_spaces():
     ):
         trace: CapturedTrace = read_write_same_size_different_address_spaces()
         IndexingContext.current().finalize()
-        promote_placeholders(trace)
+        promote_placeholders(trace, constraints)
         print_trace(trace)
         # CHECK: %a
         # CHECK-NEXT: %c
         # CHECK-NEXT: %read
         # CHECK-SAME: (%a, 4, None)
         # CHECK-NEXT: %allocate
-        # CHECK-SAME: ((M, N), f16, $SHARED_ADDRESS_SPACE)
+        # CHECK-SAME: ((M, N), (BLOCK_M, BLOCK_N), f16, $SHARED_ADDRESS_SPACE)
         # CHECK-NEXT: %write_1
         # CHECK-SAME: (%read, %allocate, 4, None)
         # CHECK-NEXT: %read_1
@@ -191,7 +191,7 @@ def test_gemm():
         graph: fx.Graph = trace.get_subgraph("region_0")
         read_nodes = get_read_nodes(graph)
         for read_node in read_nodes:
-            promote_node(read_node, SHARED_ADDRESS_SPACE)
+            promote_node(read_node, SHARED_ADDRESS_SPACE, constraints)
         hoist_allocs(trace)
         IndexingContext.current().finalize()
         print_trace(trace)
@@ -201,9 +201,9 @@ def test_gemm():
         # CHECK-NEXT: %c
         # CHECK-NEXT: %register
         # CHECK-NEXT: %allocate
-        # CHECK-SAME: ((M, K), f16, $SHARED_ADDRESS_SPACE)
+        # CHECK-SAME: ((M, K), (BLOCK_M, BLOCK_K), f16, $SHARED_ADDRESS_SPACE)
         # CHECK-NEXT: %allocate_1
-        # CHECK-SAME: ((N, K), f16, $SHARED_ADDRESS_SPACE)
+        # CHECK-SAME: ((N, K), (BLOCK_N, BLOCK_K), f16, $SHARED_ADDRESS_SPACE)
         # CHECK-NEXT: reduction
         # CHECK-NEXT: %write
         # CHECK-SAME: (%reduction, %c, 4, None)
diff --git a/shark_turbine/kernel/compiler/ir.py b/shark_turbine/kernel/compiler/ir.py
index 48d87197..5ede83fa 100644
--- a/shark_turbine/kernel/compiler/ir.py
+++ b/shark_turbine/kernel/compiler/ir.py
@@ -35,6 +35,7 @@
 
 from iree.compiler.dialects import (
     arith as arith_d,
+    amdgpu as amdgpu_d,
     builtin as builtin_d,
     flow as flow_d,
     func as func_d,
@@ -42,6 +43,7 @@
     math as math_d,
     memref as memref_d,
     stream as stream_d,
-    vector as vector_d,
     scf as scf_d,
+    transform as transform_d,
+    vector as vector_d,
 )
diff --git a/shark_turbine/kernel/compiler/vector_codegen.py b/shark_turbine/kernel/compiler/vector_codegen.py
index 775828a1..98eaf1fd 100644
--- a/shark_turbine/kernel/compiler/vector_codegen.py
+++ b/shark_turbine/kernel/compiler/vector_codegen.py
@@ -790,7 +790,11 @@ def cast_py_value(emitter: ThreadEmitter, value) -> IRProxyValue:
         try:
             node_values = emitter.lookup_node_values(value)
             assert len(node_values) == 1, f"Expected exactly one value for node {value}"
-            return node_values[0]
+            return (
+                node_values[0]
+                if isinstance(node_values[0], IRProxyValue)
+                else IRProxyValue(node_values[0])
+            )
         except KeyError:
             raise CodegenError(f"Producer node `{value}` has no IR Value")
     elif isinstance(value, IndexExpr):
@@ -828,6 +832,11 @@ def cast_kernel_buffer(
     value, node = cast_py_lvalue(emitter, kb)
     ir_type = value.type
     py_type = node.type
+    if py_type is None:
+        try:
+            py_type = ops.wave_ops.get_custom(node).type
+        except:
+            raise CodegenError(f"Could not find type for node {node}")
 
     if not MemRefType.isinstance(ir_type):
         raise CodegenError(
diff --git a/shark_turbine/kernel/lang/wave_types.py b/shark_turbine/kernel/lang/wave_types.py
index d4ffc4bb..c84bdbdd 100644
--- a/shark_turbine/kernel/lang/wave_types.py
+++ b/shark_turbine/kernel/lang/wave_types.py
@@ -11,7 +11,6 @@
 )
 
 from .kernel_buffer import AddressSpace, KernelBufferMeta, KernelBufferUsage
-from ..ops.wave_ops import register
 from .._support.dtype import DataType
 from .._support.indexing import IndexExpr, IndexSymbol, index_symbol
 
@@ -101,6 +100,8 @@ class Register(metaclass=KernelBufferMeta):
     value: float
 
     def __new__(cls, value: float) -> "Register":
+        from ..ops.wave_ops import register
+
         return register(cls.symbolic_shape, cls.dtype, value)
 
     def __class_getitem__(
diff --git a/shark_turbine/kernel/ops/wave_ops.py b/shark_turbine/kernel/ops/wave_ops.py
index 603f367d..be75e2e4 100644
--- a/shark_turbine/kernel/ops/wave_ops.py
+++ b/shark_turbine/kernel/ops/wave_ops.py
@@ -17,8 +17,7 @@
 )
 import torch.fx as fx
 
-if TYPE_CHECKING:
-    from ..lang.wave_types import Memory, Register
+from ..lang.wave_types import Memory, Register, IndexMapping
 from .._support.indexing import IndexExpr, IndexSymbol, IndexSequence
 from .._support.dtype import DataType
 from .._support.regions import RegionGraph
@@ -339,7 +338,7 @@ def index(self, value: Any):
                 assert isinstance(
                     key, IndexSequence
                 ), f"Expected IndexSequence, got {key}"
-                if not hasattr(self.fx_node, "index"):
+                if not hasattr(self.fx_node, "index") or self.fx_node.index is None:
                     self.fx_node.index = {}
                 self.fx_node.index[dim] = key
         else:
@@ -502,6 +501,7 @@ class Allocate(CustomOp):
     """
 
     shape: tuple[IndexExpr]
+    distributed_shape: tuple[IndexExpr]
     dtype: DataType
     address_space: AddressSpace
 
@@ -509,6 +509,10 @@ class Allocate(CustomOp):
     def indexing_dims(self) -> list[IndexSymbol]:
         return list(self.shape)
 
+    @property
+    def type(self) -> "Memory":
+        return Memory[*self.shape, self.address_space, self.dtype]
+
 
 @define_op("register")
 @dataclass
@@ -521,6 +525,10 @@ class NewRegister(CustomOp):
     def indexing_dims(self) -> list[IndexSymbol]:
         return list(self.shape)
 
+    @property
+    def type(self) -> "Register":
+        return Register[*self.shape, self.dtype]
+
 
 @define_op("mma")
 @dataclass
@@ -551,6 +559,10 @@ def rhs_type(self) -> Memory:
     def acc_type(self) -> Memory:
         return get_custom(self.acc).type
 
+    @property
+    def type(self) -> Memory:
+        return self.acc_type
+
     def operand_index(
         self, operand_map: dict[IndexSymbol, int], shape: list[IndexExpr]
     ) -> list[IndexSequence]:
@@ -572,7 +584,7 @@ def rhs_index(self) -> list[IndexSequence]:
     @property
     def acc_index(self) -> list[IndexSequence]:
         operand_map = {MMA_LHS: 0, MMA_RHS: 0, MMA_ACC: 1}
-        if self.acc.type is None:
+        if self.acc_type is None:
             return None
         return self.operand_index(operand_map, self.acc_type.symbolic_shape)
 
@@ -598,11 +610,11 @@ def indexing_dims(self) -> list[IndexSymbol]:
         if self.mapping is not None:
             return list(self.mapping.output_shape)
         # TODO: This could contain ints.
-        return list(self.memory.type.symbolic_shape)
+        return list(self.type.symbolic_shape)
 
     @property
     def type(self) -> "Memory":
-        return self.memory.type
+        return get_custom(self.memory).type
 
 
 @define_op("reduction")
@@ -663,11 +675,11 @@ def indexing_dims(self) -> list[IndexSymbol]:
         if self.mapping is not None:
             return list(self.mapping.input_shape)
         # TODO: This could contain ints.
-        return list(self.memory.type.symbolic_shape)
+        return list(self.type.symbolic_shape)
 
     @property
     def type(self) -> "Memory":
-        return self.memory.type
+        return get_custom(self.memory).type
 
 
 @define_op("get_result")
diff --git a/shark_turbine/kernel/wave/codegen.py b/shark_turbine/kernel/wave/codegen.py
index 84a46a82..f30ac369 100644
--- a/shark_turbine/kernel/wave/codegen.py
+++ b/shark_turbine/kernel/wave/codegen.py
@@ -5,7 +5,9 @@
 import torch.fx as fx
 
 from ..compiler.ir import (
+    Attribute,
     DenseElementsAttr,
+    FloatAttr,
     IndexType,
     InsertionPoint,
     IntegerAttr,
@@ -17,9 +19,11 @@
     ShapedType,
     Value,
     VectorType,
+    amdgpu_d,
     arith_d,
     func_d,
     gpu_d,
+    memref_d,
     stream_d,
     vector_d,
 )
@@ -27,7 +31,7 @@
 
 # TK infrastructure imports.
 from shark_turbine.kernel.lang.global_symbols import *
-from ..ops.wave_ops import write, register, mma, read, reduction, get_custom
+from ..ops.wave_ops import write, register, mma, read, reduction, get_custom, allocate
 from ..lang.wave_types import IndexMapping
 from ..compiler.base import CodegenError, ValidationError, NDEBUG
 from ..compiler.kernel_codegen import BoundKernelSignature
@@ -40,6 +44,7 @@
     cast_py_value,
     cast_vector,
 )
+from .constraints import Constraint, HardwareConstraint, MMAType
 
 # Indexing imports.
 from .._support.indexing import IndexingContext, IndexExpr, IndexSequence
@@ -51,6 +56,7 @@ class WaveEmitter:
 
     root_sig: BoundKernelSignature
     trace: CapturedTrace
+    constraints: list[Constraint]
     ip: InsertionPoint = None
     OP_HANDLERS: ClassVar[dict[str, Callable[["WaveEmitter", fx.Node], None]]] = {}
     _node_values: ClassVar[dict[fx.Node, List[IRProxyValue]]] = {}
@@ -102,6 +108,7 @@ def lookup_node_values(self, node: fx.Node) -> List[Value]:
         if values is None:
             values = [self.root_sig.resolve_by_reference(("node", node))]
             self._node_values[node] = values
+        values = [v.ir_value if isinstance(v, IRProxyValue) else v for v in values]
         return values
 
     def bind_node_proxy(self, node: fx.Node, proxy: IRProxyValue):
@@ -203,6 +210,14 @@ def gen_sympy_index(emitter: WaveEmitter, expr: sympy.Expr) -> OpResult:
     return stack[0]
 
 
+def get_constant_attr(value: Any, element_type: IrType) -> Attribute:
+    if _is_integer_like_type(element_type):
+        return IntegerAttr.get(element_type, int(value))
+    if _is_float_type(element_type):
+        return FloatAttr.get(element_type, float(value))
+    raise CodegenError(f"Cannot create a constant attribute for type `{element_type}`")
+
+
 def handle_op(op: Callable[..., Any]):
     def decorator(
         f: Callable[[WaveEmitter, fx.Node], None]
@@ -220,7 +235,36 @@ def decorator(
 
 @handle_op(register)
 def handle_register(emitter: WaveEmitter, node: fx.Node):
-    raise NotImplementedError("Register: Currently only stub implementation")
+    try:
+        shape, dtype, value = node.args
+    except ValueError as e:
+        raise ValidationError("Malformed arguments") from e
+    if hasattr(node, "thread_shape"):
+        shape = [node.thread_shape]
+    vector_shape = cast_py_literal(emitter, shape)
+    element_type = IrType.parse(dtype.ir_type_asm())
+    vector_type = VectorType.get(vector_shape, element_type)
+    register = arith_d.ConstantOp(
+        vector_type,
+        DenseElementsAttr.get_splat(
+            vector_type, get_constant_attr(value, element_type)
+        ),
+    ).result
+    emitter.bind_node_proxy(node, IRProxyValue(register))
+
+
+@handle_op(allocate)
+def handle_allocate(emitter: WaveEmitter, node: fx.Node):
+    try:
+        shape, distributed_shape, dtype, address_space = node.args
+    except ValueError as e:
+        raise ValidationError("Malformed arguments") from e
+    memref_shape = cast_py_literal(emitter, distributed_shape)
+    element_type = IrType.parse(dtype.ir_type_asm())
+    address_space = Attribute.parse("#gpu.address_space<workgroup>")
+    memref_type = MemRefType.get(memref_shape, element_type, None, address_space)
+    alloc = memref_d.alloc(memref_type, [], [])
+    emitter.bind_node_proxy(node, IRProxyValue(alloc))
 
 
 def _get_start_indices(
@@ -352,7 +396,8 @@ def handle_read(emitter: WaveEmitter, node: fx.Node):
             is_read=True,
         )
 
-        zero = arith_d.ConstantOp(vector_type.element_type, 0)
+        zero = int(0) if _is_integer_like_type(element_type) else float(0)
+        zero = arith_d.ConstantOp(vector_type.element_type, zero)
         passthru = vector_d.splat(vector_type, zero)
 
         result = vector_d.gather(
@@ -373,11 +418,13 @@ def handle_write(emitter: WaveEmitter, node: fx.Node):
     kb_dest, kb_ir_type, kb_py_type = cast_kernel_buffer(emitter, memory)
     insert_vector = cast_vector(emitter, register, element_type=kb_ir_type.element_type)
     insert_type = VectorType(insert_vector.type)
+    vector_shape = cast_py_literal(emitter, (elements_per_thread,))
 
     # TODO: Support elements_per_thread size mismatch and broadcasting
-    assert tuple(insert_type.shape) == (
-        elements_per_thread,
-    ), f"Shape doesn't match: {tuple(insert_type.shape)} and {(elements_per_thread,)}"
+
+    assert (
+        tuple(insert_type.shape) == vector_shape
+    ), f"Shape doesn't match: {tuple(insert_type.shape)} and {(vector_shape)}"
 
     if not hasattr(node, "index"):
         raise ValidationError("codegen expected read to have index attr.")
@@ -412,9 +459,48 @@ def handle_write(emitter: WaveEmitter, node: fx.Node):
 ###############################################################################
 
 
+def emit_mfma(
+    m: int, n: int, k: int, vector_type: VectorType, acc: Value, values: list[Value]
+):
+    m = get_constant_attr(m, IntegerType.get_signless(32))
+    n = get_constant_attr(n, IntegerType.get_signless(32))
+    k = get_constant_attr(k, IntegerType.get_signless(32))
+    blocks = get_constant_attr(1, IntegerType.get_signless(32))
+
+    result = amdgpu_d.mfma(
+        dest_d=vector_type,
+        m=m,
+        n=n,
+        k=k,
+        blocks=blocks,
+        source_a=values[0],
+        source_b=values[1],
+        dest_c=acc,
+    )
+    return result
+
+
 @handle_op(mma)
 def handle_mma(emitter: WaveEmitter, node: fx.Node):
-    raise NotImplementedError("MMA: Currently only stub implementation")
+    try:
+        lhs, rhs, acc = node.args
+        acc = cast_vector(emitter, acc)
+        values = [lhs, rhs]
+        for i in range(len(values)):
+            values[i] = cast_vector(emitter, values[i])
+    except ValueError as e:
+        raise ValidationError("Malformed arguments") from e
+
+    vector_type = VectorType(acc.type)
+    result = None
+    for constraint in emitter.constraints:
+        if isinstance(constraint, HardwareConstraint):
+            m, n, k = constraint.mma_matrix_shapes
+            result = emit_mfma(m, n, k, vector_type, acc, values)
+            break
+
+    if result:
+        emitter.bind_node_proxy(node, IRProxyValue(result))
 
 
 @handle_op(operator.add)
diff --git a/shark_turbine/kernel/wave/constraints.py b/shark_turbine/kernel/wave/constraints.py
index d725a719..3917da13 100644
--- a/shark_turbine/kernel/wave/constraints.py
+++ b/shark_turbine/kernel/wave/constraints.py
@@ -213,3 +213,19 @@ def apply(self) -> IndexSequence:
         if self.wave_id is None:
             raise ValueError("Index is being computed without setting wave id")
         return IndexSequence(self.tile_size * self.wave_id, 1)
+
+
+def get_workgroup_distributed_shape(
+    shape: list[IndexExpr], constraints: list[WorkgroupConstraint]
+) -> tuple[IndexExpr]:
+    """
+    Given a shape and workgroup constraints, returns the shape
+    of the tensor after it has been distributed along workgroup dimensions.
+    """
+    distributed_shape = [s for s in shape]
+    for i, dim in enumerate(shape):
+        for constraint in constraints:
+            if isinstance(constraint, WorkgroupConstraint):
+                if dim == constraint.dim:
+                    distributed_shape[i] = constraint.tile_size
+    return tuple(distributed_shape)
diff --git a/shark_turbine/kernel/wave/expansion.py b/shark_turbine/kernel/wave/expansion.py
index c49ca119..62f8b9c5 100644
--- a/shark_turbine/kernel/wave/expansion.py
+++ b/shark_turbine/kernel/wave/expansion.py
@@ -74,7 +74,7 @@ def get_indexed_dims(
     """
     if isinstance(nodeOrDims, CustomOp):
         nodeOrDims = nodeOrDims.indexing_dims
-    return tuple((key, all_dims[key]) for key in nodeOrDims)
+    return tuple((key, all_dims[key]) for key in nodeOrDims if key in all_dims)
 
 
 def get_last(node_list: fx.graph._node_list) -> fx.Node:  # type: ignore
@@ -173,8 +173,11 @@ def set_node_index(
                     index_seq.start += constraint.apply().start
 
         if index_seq is not None:
-            index_seq.start += dim_scaling[dim] * dim_tile_size[dim]
+            if dim in dim_scaling and dim in dim_tile_size:
+                index_seq.start += dim_scaling[dim] * dim_tile_size[dim]
             custom.index = {dim: index_seq}
+        else:
+            custom.index = {dim: IndexSequence(0, 1, 1)}
 
     setattr(custom.fx_node, "index", custom.index)
 
@@ -464,6 +467,7 @@ def _handle_reduction_dim(
                 # placeholder which will not trigger further expansion.
                 index = user.node_args.index(carried_node)
                 dummy = Placeholder("dummy").add_to_graph(user.graph)
+                dummy.type = None
 
                 saved_arg = user.node_args[index]
                 user.update_arg(index, dummy)
diff --git a/shark_turbine/kernel/wave/promotion.py b/shark_turbine/kernel/wave/promotion.py
index 2b36fa62..1e203971 100644
--- a/shark_turbine/kernel/wave/promotion.py
+++ b/shark_turbine/kernel/wave/promotion.py
@@ -3,15 +3,16 @@
 from .._support.indexing import IndexingContext
 from ..ops.wave_ops import *
 from ..lang.global_symbols import *
+from .constraints import Constraint, get_workgroup_distributed_shape
 
 logger = get_logger("turbine.wave.promotion")
 
 
 def apply_promotion_pattern(custom_node: Read | Write, allocate_node: Allocate):
     match custom_node:
-        case Read(
-            memory, elements_per_thread
-        ) if memory.type.address_space != allocate_node.address_space:
+        case Read(memory, elements_per_thread) if get_custom(
+            memory
+        ).type.address_space != allocate_node.address_space:
             promoted_read = Read(
                 allocate_node.fx_node, elements_per_thread
             ).add_to_graph(custom_node.graph)
@@ -20,11 +21,11 @@ def apply_promotion_pattern(custom_node: Read | Write, allocate_node: Allocate):
                 Write(
                     custom_node.fx_node, allocate_node.fx_node, elements_per_thread
                 ).add_to_graph(custom_node.graph)
-        case _:
-            logger.error(f"Attempted to promoted unsupported operator {custom_node}")
 
 
-def promote_node(node: Read | Write, address_space: IndexSymbol):
+def promote_node(
+    node: Read | Write, address_space: IndexSymbol, constraints: list[Constraint]
+):
     """Promotes the given operand in the provided graph
     to the specified address space.
 
@@ -35,20 +36,29 @@ def promote_node(node: Read | Write, address_space: IndexSymbol):
 
     assert isinstance(node, Read) or isinstance(node, Write)
     with node.graph.inserting_before(node.fx_node.next):
+        workgroup_distributed_shape = get_workgroup_distributed_shape(
+            node.type.symbolic_shape, constraints
+        )
         allocate_node = Allocate(
-            node.type.symbolic_shape, node.type.dtype, address_space
+            node.type.symbolic_shape,
+            workgroup_distributed_shape,
+            node.type.dtype,
+            address_space,
         )
         allocate_node.add_to_graph(node.graph)
         apply_promotion_pattern(node, allocate_node)
 
 
-def promote_placeholders(graph: CapturedTrace):
-    for node in graph.get_root_graph().nodes:
+def promote_placeholders(graph: CapturedTrace, constraints: list[Constraint]):
+    read_or_write_nodes = graph.walk(
+        lambda node: isinstance(get_custom(node), Read)
+        or isinstance(get_custom(node), Write)
+    )
+    for node in read_or_write_nodes:
         custom = get_custom(node)
-        if isinstance(custom, Read) or isinstance(custom, Write):
-            if not custom.type:
-                continue
-            idxc = IndexingContext.current()
-            address_space = custom.type.address_space.subs(idxc.subs)
-            if address_space == SHARED_ADDRESS_SPACE:
-                promote_node(custom, address_space)
+        if not custom.type:
+            continue
+        idxc = IndexingContext.current()
+        address_space = custom.type.address_space.subs(idxc.subs)
+        if address_space == SHARED_ADDRESS_SPACE:
+            promote_node(custom, address_space, constraints)
diff --git a/shark_turbine/kernel/wave/register_analysis.py b/shark_turbine/kernel/wave/register_analysis.py
new file mode 100644
index 00000000..f7a4d79d
--- /dev/null
+++ b/shark_turbine/kernel/wave/register_analysis.py
@@ -0,0 +1,32 @@
+from .._support.tracing import CapturedTrace
+from ...support.logging import get_logger
+from ..ops.wave_ops import *
+
+logger = get_logger("turbine.wave.register_analysis")
+
+
+def determine_register_shape(trace: CapturedTrace) -> None:
+    """
+    Each register op is annotated with the wave shape of the register. This
+    function determines the thread shape of the register based on the uses
+    of the register in the graph.
+    """
+    register_nodes = trace.walk(lambda node: isinstance(get_custom(node), NewRegister))
+    if not register_nodes:
+        return
+    for node in register_nodes:
+        custom_node = get_custom(node)
+        for user in node.users.keys():
+            custom_user = get_custom(user)
+            if isinstance(custom_user, MMA):
+                arg_index = user.args.index(node)
+                if arg_index == 0:
+                    custom_node.fx_node.thread_shape = custom_user.lhs_index[0].size
+                if arg_index == 1:
+                    custom_node.fx_node.thread_shape = custom_user.rhs_index[0].size
+                if arg_index == 2:
+                    custom_node.fx_node.thread_shape = custom_user.acc_index[0].size
+            else:
+                raise NotImplementedError(
+                    f"Register shape propagation not implemented for {user}"
+                )
diff --git a/shark_turbine/kernel/wave/utils.py b/shark_turbine/kernel/wave/utils.py
new file mode 100644
index 00000000..d7088e63
--- /dev/null
+++ b/shark_turbine/kernel/wave/utils.py
@@ -0,0 +1,35 @@
+from ..compiler.ir import (
+    builtin_d,
+    InsertionPoint,
+    Location,
+    Operation,
+    transform_d,
+    UnitAttr,
+)
+
+from iree.compiler.dialects.transform import (
+    interpreter as transform_interpreter,
+    any_op_t,
+)
+
+
+def canonicalize_module(module: Operation):
+    with module.context, Location.unknown():
+        transform_module = builtin_d.Module.create()
+        transform_module_op = module.operation
+        transform_module_op.attributes["transform.with_named_sequence"] = UnitAttr.get()
+        with InsertionPoint(transform_module.body):
+            named_sequence = transform_d.NamedSequenceOp(
+                "__transform_main", [any_op_t()], []
+            )
+            with InsertionPoint(named_sequence.body):
+                target = named_sequence.body.arguments[0]
+                apply_patterns = transform_d.ApplyPatternsOp(target)
+                with InsertionPoint(apply_patterns.regions[0].blocks[0]):
+                    transform_d.apply_patterns_canonicalization()
+                transform_d.YieldOp([target])
+        transform_interpreter.apply_named_sequence(
+            module,
+            transform_module.body.operations[0],
+            transform_module,
+        )
diff --git a/shark_turbine/kernel/wave/wave.py b/shark_turbine/kernel/wave/wave.py
index 26f5b007..cc30cce7 100644
--- a/shark_turbine/kernel/wave/wave.py
+++ b/shark_turbine/kernel/wave/wave.py
@@ -17,10 +17,12 @@
 from .expansion import expand_graph
 from .promotion import promote_placeholders
 from .hoisting import hoist_allocs
+from .utils import canonicalize_module
 from ..lang import Grid, IndexMapping
 from ..lang.global_symbols import *
 from ..ops import wave_ops
 from ..ops.wave_ops import Reduction, CustomOp, get_custom
+from .register_analysis import determine_register_shape
 from .._support.indexing import IndexingContext, IndexExpr
 import shark_turbine.kernel.lang as tkl
 from .._support.tracing import (
@@ -173,12 +175,15 @@ def _trace_and_get_kernel_signature(
         idxc.finalize()
 
         # Promote the placeholders to the appropriate address space.
-        promote_placeholders(graph)
+        promote_placeholders(graph, self.constraints)
         hoist_allocs(graph)
 
         # Expansion
         expand_graph(graph, self.constraints)
 
+        # Register analysis to determine register shapes.
+        determine_register_shape(graph)
+
         self.grid_type.dims = [1, 1, 1]
         for constraint in self.workgroup_constraints:
             self.grid_type.dims[constraint.workgroup_dim] = (
@@ -197,10 +202,13 @@ def _trace_and_get_kernel_signature(
         exe = dispatch_codegen.StreamExecutable(mb, name=entrypoint_name)
         dispatch_entrypoint = exe.define_entrypoint(entrypoint_name, kernel_sig, grid)
 
-        emitter = WaveEmitter(dispatch_entrypoint, graph)
+        emitter = WaveEmitter(dispatch_entrypoint, graph, self.constraints)
         emitter.emit(graph.get_root_graph())
         emitter.finish()
 
+        if "canonicalize" in kwargs and kwargs["canonicalize"]:
+            canonicalize_module(mb.module_op)
+
         return mb, graph
 
     def test_execute(self, args, kwargs):
diff --git a/tests/kernel/wave/wave_gemm_test.py b/tests/kernel/wave/wave_gemm_test.py
index 21b12c2f..8542bc64 100644
--- a/tests/kernel/wave/wave_gemm_test.py
+++ b/tests/kernel/wave/wave_gemm_test.py
@@ -77,7 +77,8 @@ def repeat(acc: tkl.Register[M, N, tkl.f32]) -> tkl.Register[M, N, tkl.f32]:
             K: 256,
         }
         with pytest.raises(
-            NotImplementedError, match="Currently only stub implementation"
+            NotImplementedError,
+            match="Register shape propagation not implemented for reduction",
         ):
             with tk.gen.TestLaunchContext(hyperparams):
                 a = torch.randn(64, 256, dtype=torch.float16)

From b3612ab14ab525d81656259add603aacdcbf1f4a Mon Sep 17 00:00:00 2001
From: Harsh Menon <harsh@nod-labs.com>
Date: Fri, 16 Aug 2024 20:26:41 -0700
Subject: [PATCH 2/2] Address Ivan's comments

Signed-off-by: Harsh Menon <harsh@nod-labs.com>
---
 lit_tests/kernel/wave/codegen.py         |  5 +++--
 shark_turbine/kernel/_support/tracing.py |  5 ++++-
 shark_turbine/kernel/wave/codegen.py     | 26 ++++++++++++++----------
 shark_turbine/kernel/wave/constraints.py |  2 +-
 shark_turbine/kernel/wave/wave.py        |  2 +-
 5 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/lit_tests/kernel/wave/codegen.py b/lit_tests/kernel/wave/codegen.py
index 6c80e55b..b63c3145 100644
--- a/lit_tests/kernel/wave/codegen.py
+++ b/lit_tests/kernel/wave/codegen.py
@@ -289,12 +289,13 @@ def mma(
             STORE_ELEMS_PER_THREAD: 4,
             ADDRESS_SPACE: SHARED_ADDRESS_SPACE,
             ADDRESS_SPACE_0: GLOBAL_ADDRESS_SPACE,
-        }
+        },
+        canonicalize=True,
     ):
         a = torch.randn(64, 32, dtype=torch.float16)
         b = torch.randn(128, 32, dtype=torch.float16)
         c = torch.zeros(64, 128, dtype=torch.float32)
-        print(mma(a, b, c, canonicalize=True).module_op)
+        print(mma(a, b, c).module_op)
 
         # CHECK: func.func @mma(%[[ARG0:.+]]: !stream.binding, %[[ARG1:.+]]: !stream.binding, %[[ARG2:.+]]: !stream.binding) {
         # CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
diff --git a/shark_turbine/kernel/_support/tracing.py b/shark_turbine/kernel/_support/tracing.py
index ec609561..d1a41604 100644
--- a/shark_turbine/kernel/_support/tracing.py
+++ b/shark_turbine/kernel/_support/tracing.py
@@ -420,8 +420,9 @@ def test_execute(self, args, kwargs):
 class LaunchContext(ABC):
     __tk_context_idname__ = "ExecutionContext"
 
-    def __init__(self, constant_bindings: Dict[IndexSymbol, int] = {}):
+    def __init__(self, constant_bindings: Dict[IndexSymbol, int] = {}, **kwargs):
         self.constant_bindings = constant_bindings
+        self.kwargs = kwargs
 
     @staticmethod
     def current() -> "LaunchContext":
@@ -464,6 +465,8 @@ def launch(self, launchable: Launchable, args, kwargs):
 
 class TestLaunchContext(LaunchContext):
     def launch(self, launchable: Launchable, args, kwargs):
+        if self.kwargs:
+            kwargs.update(self.kwargs)
         return launchable.test_execute(args, kwargs)
 
 
diff --git a/shark_turbine/kernel/wave/codegen.py b/shark_turbine/kernel/wave/codegen.py
index f30ac369..8a1e8ccd 100644
--- a/shark_turbine/kernel/wave/codegen.py
+++ b/shark_turbine/kernel/wave/codegen.py
@@ -485,22 +485,26 @@ def handle_mma(emitter: WaveEmitter, node: fx.Node):
     try:
         lhs, rhs, acc = node.args
         acc = cast_vector(emitter, acc)
-        values = [lhs, rhs]
-        for i in range(len(values)):
-            values[i] = cast_vector(emitter, values[i])
+        values = [cast_vector(emitter, val) for val in [lhs, rhs]]
     except ValueError as e:
         raise ValidationError("Malformed arguments") from e
 
     vector_type = VectorType(acc.type)
+
+    hardware_constraints = [
+        constraint
+        for constraint in emitter.constraints
+        if isinstance(constraint, HardwareConstraint)
+    ]
+    if not hardware_constraints:
+        raise CodegenError("No hardware constraints found.")
+
     result = None
-    for constraint in emitter.constraints:
-        if isinstance(constraint, HardwareConstraint):
-            m, n, k = constraint.mma_matrix_shapes
-            result = emit_mfma(m, n, k, vector_type, acc, values)
-            break
-
-    if result:
-        emitter.bind_node_proxy(node, IRProxyValue(result))
+    for constraint in hardware_constraints:
+        m, n, k = constraint.mma_matrix_shapes
+        result = emit_mfma(m, n, k, vector_type, acc, values)
+
+    emitter.bind_node_proxy(node, IRProxyValue(result))
 
 
 @handle_op(operator.add)
diff --git a/shark_turbine/kernel/wave/constraints.py b/shark_turbine/kernel/wave/constraints.py
index 3917da13..989171ac 100644
--- a/shark_turbine/kernel/wave/constraints.py
+++ b/shark_turbine/kernel/wave/constraints.py
@@ -222,7 +222,7 @@ def get_workgroup_distributed_shape(
     Given a shape and workgroup constraints, returns the shape
     of the tensor after it has been distributed along workgroup dimensions.
     """
-    distributed_shape = [s for s in shape]
+    distributed_shape = list(shape)
     for i, dim in enumerate(shape):
         for constraint in constraints:
             if isinstance(constraint, WorkgroupConstraint):
diff --git a/shark_turbine/kernel/wave/wave.py b/shark_turbine/kernel/wave/wave.py
index cc30cce7..fe10ee46 100644
--- a/shark_turbine/kernel/wave/wave.py
+++ b/shark_turbine/kernel/wave/wave.py
@@ -206,7 +206,7 @@ def _trace_and_get_kernel_signature(
         emitter.emit(graph.get_root_graph())
         emitter.finish()
 
-        if "canonicalize" in kwargs and kwargs["canonicalize"]:
+        if kwargs.get("canonicalize", False):
             canonicalize_module(mb.module_op)
 
         return mb, graph