
Commit d55785d
[LLVMGPU] Add GPUCombineValueBarriersPass to TileAndFuse pipeline (iree-org#18446)

This adds the `GPUCombineValueBarriersPass` to the TileAndFuse pipeline
to improve barrier placement by combining adjacent value barriers before bufferization.

Signed-off-by: Max Dawkins <max.dawkins@gmail.com>
Max191 authored Sep 5, 2024
1 parent 17de12f commit d55785d
Showing 2 changed files with 25 additions and 32 deletions.
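
For orientation when reading the hunks below, the sketch that follows mirrors the Passes.cpp hunk with editorial comments added. The stated rationale (merging value barriers while they are still value-based, before bufferization lowers them to gpu.barrier) is an inference from the pass name and its position in the pipeline, not something spelled out in the commit.

void addGPUTileAndFusePassPipeline(OpPassManager &funcPassManager) {
  // ... tiling, distribution, and promotion steps elided ...
  funcPassManager.addPass(IREE::GPU::createVectorizeIREEGPUOpsPass());
  addGPUVectorizationPasses(funcPassManager);
  funcPassManager.addPass(createCleanupBufferAllocViewPass());
  // New in this commit: combine adjacent value barriers so that each loop
  // iteration ends up with fewer gpu.barrier ops after lowering: roughly one
  // barrier after the shared-memory writes and one after the shared-memory
  // reads, rather than separate barriers around each operand's copy (see the
  // CHECK updates in the test hunks below).
  funcPassManager.addPass(createGPUCombineValueBarriersPass());

  // Step 7. Bufferize.
  addGPUBufferizePasses(funcPassManager);
  // ... remaining lowering steps elided ...
}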
1 change: 1 addition & 0 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
@@ -406,6 +406,7 @@ void addGPUTileAndFusePassPipeline(OpPassManager &funcPassManager) {
funcPassManager.addPass(IREE::GPU::createVectorizeIREEGPUOpsPass());
addGPUVectorizationPasses(funcPassManager);
funcPassManager.addPass(createCleanupBufferAllocViewPass());
+funcPassManager.addPass(createGPUCombineValueBarriersPass());

// Step 7. Bufferize.
addGPUBufferizePasses(funcPassManager);
@@ -46,15 +46,13 @@ hal.executable public @main {
// CHECK-DAG: memref.alloc() : memref<64x8xf16, #gpu.address_space<workgroup>>
// CHECK-DAG: memref.alloc() : memref<64x8xf16, #gpu.address_space<workgroup>>
// CHECK: %[[LOOP:.+]] = scf.for %[[IV:.+]] = %c0 to %c1280 step %c4 {{.*}} -> (vector<8x4xf32>)
-// CHECK: %[[LHS_RD:.+]] = vector.transfer_read %[[B0]]{{.*}} vector<2xf16>
-// CHECK: vector.transfer_write %[[LHS_RD]], %[[LHS_ALLOC:[A-Za-z0-9]+]]
+// CHECK-DAG: %[[LHS_RD:.+]] = vector.transfer_read %[[B0]]{{.*}} vector<2xf16>
+// CHECK-DAG: vector.transfer_write %[[LHS_RD]], %[[LHS_ALLOC:[A-Za-z0-9]+]]
+// CHECK-DAG: %[[RHS_RD:.+]] = vector.transfer_read %[[B1]]{{.*}} vector<2xf16>
+// CHECK-DAG: vector.transfer_write %[[RHS_RD]], %[[RHS_ALLOC:[A-Za-z0-9]+]]
// CHECK: gpu.barrier
-// CHECK: %[[LHS_MM:.+]] = vector.transfer_read %[[LHS_ALLOC]]{{.*}} vector<8x4xf16>
-// CHECK: gpu.barrier
-// CHECK: %[[RHS_RD:.+]] = vector.transfer_read %[[B1]]{{.*}} vector<2xf16>
-// CHECK: vector.transfer_write %[[RHS_RD]], %[[RHS_ALLOC:[A-Za-z0-9]+]]
-// CHECK: gpu.barrier
-// CHECK: %[[RHS_MM:.+]] = vector.transfer_read %[[RHS_ALLOC]]{{.*}} vector<4x4xf16>
+// CHECK-DAG: %[[LHS_MM:.+]] = vector.transfer_read %[[LHS_ALLOC]]{{.*}} vector<8x4xf16>
+// CHECK-DAG: %[[RHS_MM:.+]] = vector.transfer_read %[[RHS_ALLOC]]{{.*}} vector<4x4xf16>
// CHECK: gpu.barrier
// CHECK: %[[MM:.+]] = vector.contract {{.*}} %[[LHS_MM]], %[[RHS_MM]]
// CHECK: scf.yield %[[MM]]
@@ -104,18 +102,16 @@ hal.executable public @main {
// CHECK-DAG: memref.alloc() : memref<64x36xf16, #gpu.address_space<workgroup>>
// CHECK-DAG: memref.alloc() : memref<64x36xf16, #gpu.address_space<workgroup>>
// CHECK: %[[LOOP:.+]] = scf.for %[[IV:.+]] = %c0 to %c80 step %c2 {{.*}} -> (vector<2x2x4x1xf32>)
-// CHECK: %[[LHS_RD:.+]] = vector.transfer_read %[[B0]]{{.*}} vector<8xf16>
-// CHECK: vector.transfer_write %[[LHS_RD]]
+// CHECK-DAG: %[[LHS_RD:.+]] = vector.transfer_read %[[B0]]{{.*}} vector<8xf16>
+// CHECK-DAG: vector.transfer_write %[[LHS_RD]]
+// CHECK-DAG: %[[RHS_RD:.+]] = vector.transfer_read %[[B1]]{{.*}} vector<8xf16>
+// CHECK-DAG: vector.transfer_write %[[RHS_RD]]
// CHECK: gpu.barrier
// CHECK: %[[LHS_MM:.+]] = vector.transfer_read {{.*}} vector<2x1x2x4xf16>
-// CHECK: gpu.barrier
-// CHECK: vector.transpose %[[LHS_MM]], [0, 2, 1, 3] : vector<2x1x2x4xf16>
-// CHECK: %[[RHS_RD:.+]] = vector.transfer_read %[[B1]]{{.*}} vector<8xf16>
-// CHECK: vector.transfer_write %[[RHS_RD]]
-// CHECK: gpu.barrier
// CHECK: %[[RHS_MM:.+]] = vector.transfer_read {{.*}} vector<2x1x2x4xf16>
// CHECK: gpu.barrier
-// CHECK: vector.transpose %[[RHS_MM]], [0, 2, 1, 3] : vector<2x1x2x4xf16>
+// CHECK-DAG: vector.transpose %[[LHS_MM]], [0, 2, 1, 3] : vector<2x1x2x4xf16>
+// CHECK-DAG: vector.transpose %[[RHS_MM]], [0, 2, 1, 3] : vector<2x1x2x4xf16>
// CHECK-COUNT-4: amdgpu.mfma {{.*}}blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32
// CHECK: scf.yield
// CHECK: %[[LOOP_T:.+]] = vector.transpose %[[LOOP]], [0, 2, 1, 3] : vector<2x2x4x1xf32> to vector<2x4x2x1xf32>
@@ -188,19 +184,17 @@ hal.executable private @main {
// CHECK-DAG: %[[C720:.+]] = arith.constant 720 : index
// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
// CHECK: %[[LOOP:.+]] = scf.for %[[IV:.+]] = %[[C0]] to %[[C720]] step %[[C2]] {{.*}} -> (vector<1x2x2x4x1xf32>)
-// CHECK: %[[LHS_RD:.+]] = vector.transfer_read %[[B0]]{{.*}} vector<8xf16>
-// CHECK: vector.transfer_write %[[LHS_RD]]
+// CHECK-DAG: %[[LHS_RD:.+]] = vector.transfer_read %[[B0]]{{.*}} vector<8xf16>
+// CHECK-DAG: vector.transfer_write %[[LHS_RD]]
+// CHECK-DAG: %[[RHS_RD:.+]] = vector.transfer_read %[[B1]]{{.*}} vector<8xf16>
+// CHECK-DAG: vector.transfer_write %[[RHS_RD]]
// CHECK: gpu.barrier
// CHECK: %[[LHS_MM0:.+]] = vector.transfer_read {{.*}} vector<2x1x2x4xf16>
// CHECK: %[[LHS_MM1:.+]] = vector.broadcast {{.*}} vector<2x1x2x4xf16> to vector<1x2x1x2x4xf16>
-// CHECK: gpu.barrier
-// CHECK: vector.transpose %[[LHS_MM1]], [0, 1, 3, 2, 4] : vector<1x2x1x2x4xf16> to vector<1x2x2x1x4xf16>
-// CHECK: %[[RHS_RD:.+]] = vector.transfer_read %[[B1]]{{.*}} vector<8xf16>
-// CHECK: vector.transfer_write %[[RHS_RD]]
-// CHECK: gpu.barrier
// CHECK: %[[RHS_MM:.+]] = vector.transfer_read {{.*}} vector<2x4x2x1xf16>
// CHECK: gpu.barrier
-// CHECK: vector.transpose %[[RHS_MM]], [0, 2, 3, 1] : vector<2x4x2x1xf16> to vector<2x2x1x4xf16>
+// CHECK-DAG: vector.transpose %[[LHS_MM1]], [0, 1, 3, 2, 4] : vector<1x2x1x2x4xf16> to vector<1x2x2x1x4xf16>
+// CHECK-DAG: vector.transpose %[[RHS_MM]], [0, 2, 3, 1] : vector<2x4x2x1xf16> to vector<2x2x1x4xf16>
// CHECK-COUNT-4: amdgpu.mfma {{.*}}blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32
// CHECK: %[[LOOP_T:.+]] = vector.transpose %[[LOOP]], [0, 1, 3, 2, 4] : vector<1x2x2x4x1xf32> to vector<1x2x4x2x1xf32>
// CHECK: %[[EXTRACT:.+]] = vector.extract %[[LOOP_T]][0] : vector<2x4x2x1xf32> from vector<1x2x4x2x1xf32>
@@ -254,18 +248,16 @@ hal.executable public @main {
// CHECK-DAG: memref.alloc() : memref<64x36xf16, #gpu.address_space<workgroup>>
// CHECK-DAG: memref.alloc() : memref<64x36xf16, #gpu.address_space<workgroup>>
// CHECK: %[[LOOP:.+]] = scf.for %[[IV:.+]] = %c0 to %c80 step %c2 {{.*}} -> (vector<2x2x8x1x1xf32>)
-// CHECK: %[[LHS_RD:.+]] = vector.transfer_read %[[B0]]{{.*}} vector<2x8xf16>
-// CHECK: vector.transfer_write %[[LHS_RD]]
+// CHECK-DAG: %[[LHS_RD:.+]] = vector.transfer_read %[[B0]]{{.*}} vector<2x8xf16>
+// CHECK-DAG: vector.transfer_write %[[LHS_RD]]
+// CHECK-DAG: %[[RHS_RD:.+]] = vector.transfer_read %[[B1]]{{.*}} vector<2x8xf16>
+// CHECK-DAG: vector.transfer_write %[[RHS_RD]]
// CHECK: gpu.barrier
// CHECK: %[[LHS_MM:.+]] = vector.transfer_read {{.*}} vector<2x1x2x16xf16>
-// CHECK: gpu.barrier
-// CHECK: vector.transpose %[[LHS_MM]], [0, 2, 1, 3] : vector<2x1x2x16xf16>
-// CHECK: %[[RHS_RD:.+]] = vector.transfer_read %[[B1]]{{.*}} vector<2x8xf16>
-// CHECK: vector.transfer_write %[[RHS_RD]]
-// CHECK: gpu.barrier
// CHECK: %[[RHS_MM:.+]] = vector.transfer_read {{.*}} vector<2x1x2x16xf16>
// CHECK: gpu.barrier
-// CHECK: vector.transpose %[[RHS_MM]], [0, 2, 1, 3] : vector<2x1x2x16xf16>
+// CHECK-DAG: vector.transpose %[[LHS_MM]], [0, 2, 1, 3] : vector<2x1x2x16xf16>
+// CHECK-DAG: vector.transpose %[[RHS_MM]], [0, 2, 1, 3] : vector<2x1x2x16xf16>
// CHECK-COUNT-8: amdgpu.wmma {{.*}} : vector<16xf16>, vector<16xf16>, vector<8xf32>
// CHECK: scf.yield
// CHECK: %[[LOOP_T:.+]] = vector.transpose %[[LOOP]], [0, 2, 3, 1, 4] : vector<2x2x8x1x1xf32> to vector<2x8x1x2x1xf32>
