diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUReduceBankConflicts.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUReduceBankConflicts.cpp
index 807ab9d339eb..51898adc02d7 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUReduceBankConflicts.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUReduceBankConflicts.cpp
@@ -18,6 +18,23 @@ namespace mlir::iree_compiler {
 
 namespace {
 
+/// Check if the AllocOp has a CollapseShapeOp user, possibly via subviews.
+static bool hasCollapseShapeUser(memref::AllocOp allocOp) {
+  SmallVector<Operation *> users(allocOp->getUsers());
+  while (!users.empty()) {
+    auto *user = users.pop_back_val();
+    if (isa<memref::CollapseShapeOp>(user)) {
+      return true;
+    }
+    if (isa<memref::SubViewOp>(user)) {
+      for (auto *u : user->getUsers()) {
+        users.push_back(u);
+      }
+    }
+  }
+  return false;
+}
+
 /// Pad out the inner dimension of the `memref.alloc` op in order reduce the
 /// chances to have bank conflicts when reading 2D shapes within shared memory.
 static void padAlloc(MLIRContext *context, memref::AllocOp allocOp,
@@ -28,6 +45,12 @@ static void padAlloc(MLIRContext *context, memref::AllocOp allocOp,
   int64_t innerDim = allocOpShape.back();
   if (ShapedType::isDynamic(innerDim))
     return;
+
+  // Bail out if the alloc has a CollapseShapeOp user, as padding is not
+  // supported in that case.
+  if (hasCollapseShapeUser(allocOp))
+    return;
+
   Type elType = allocOp.getType().getElementType();
   unsigned bitwidth =
       mlir::DataLayout::closest(allocOp).getTypeSizeInBits(elType);
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/reduce_bank_conflicts.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/reduce_bank_conflicts.mlir
index befb2445ab24..b934772ffd34 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/reduce_bank_conflicts.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/reduce_bank_conflicts.mlir
@@ -47,6 +47,66 @@ func.func @pad_alloc_expand_shape(%a: memref<1024x1024xf32>) {
   return
 }
 
+// -----
+// CHECK-LABEL: func.func @no_pad_alloc_collapse_shape
+// CHECK: %[[A:.*]] = memref.alloc() : memref<4x2x16x8x8xf32, #gpu.address_space<workgroup>>
+// CHECK: %[[C:.*]] = memref.collapse_shape %[[A]] {{\[}}[0], [1, 2], [3, 4]]
+// CHECK-SAME: memref<4x2x16x8x8xf32, #gpu.address_space<workgroup>> into
+// CHECK-SAME: memref<4x32x64xf32, #gpu.address_space<workgroup>>
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[CST_0:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[VEC_READ:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]]], %[[CST_0]] {in_bounds = [true]} :
+// CHECK-SAME: memref<1024x1024xf32>, vector<4xf32>
+// CHECK: vector.transfer_write %[[VEC_READ]], %[[C]][%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true]} :
+// CHECK-SAME: vector<4xf32>, memref<4x32x64xf32, #gpu.address_space<workgroup>>
+
+
+func.func @no_pad_alloc_collapse_shape(%a: memref<1024x1024xf32>) {
+  %0 = memref.alloc() : memref<4x2x16x8x8xf32, #gpu.address_space<workgroup>>
+  %1 = memref.collapse_shape %0 [[0], [1, 2], [3, 4]]
+    : memref<4x2x16x8x8xf32, #gpu.address_space<workgroup>> into memref<4x32x64xf32, #gpu.address_space<workgroup>>
+  %c0 = arith.constant 0 : index
+  %cst_0 = arith.constant 0.000000e+00 : f32
+  %3 = vector.transfer_read %a[%c0, %c0], %cst_0 {in_bounds = [true]} :
+    memref<1024x1024xf32>, vector<4xf32>
+  vector.transfer_write %3, %1[%c0, %c0, %c0] {in_bounds = [true]} :
+    vector<4xf32>, memref<4x32x64xf32, #gpu.address_space<workgroup>>
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @no_pad_alloc_collapse_shape_throughsubview
+// CHECK: %[[A:.*]] = memref.alloc() : memref<4x2x16x8x8xf32, #gpu.address_space<workgroup>>
+// CHECK: %[[S:.*]] = memref.subview %[[A]][0, 0, 0, 0, 0] [4, 2, 16, 8, 8] [1, 1, 1, 1, 1] :
+// CHECK-SAME: memref<4x2x16x8x8xf32, #gpu.address_space<workgroup>> to
+// CHECK-SAME: memref<4x2x16x8x8xf32, #gpu.address_space<workgroup>>
+// CHECK: %[[C:.*]] = memref.collapse_shape %[[S]] {{\[}}[0], [1, 2], [3, 4]]
+// CHECK-SAME: memref<4x2x16x8x8xf32, #gpu.address_space<workgroup>> into
+// CHECK-SAME: memref<4x32x64xf32, #gpu.address_space<workgroup>>
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[VEC_READ:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]]], %[[CST]] {in_bounds = [true]} :
+// CHECK-SAME: memref<1024x1024xf32>, vector<4xf32>
+// CHECK: vector.transfer_write %[[VEC_READ]], %[[C]][%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true]} :
+// CHECK-SAME: vector<4xf32>, memref<4x32x64xf32, #gpu.address_space<workgroup>>
+
+
+func.func @no_pad_alloc_collapse_shape_throughsubview(%a: memref<1024x1024xf32>) {
+  %0 = memref.alloc() : memref<4x2x16x8x8xf32, #gpu.address_space<workgroup>>
+  %subview = memref.subview %0[0, 0, 0, 0, 0] [4, 2, 16, 8, 8] [1, 1, 1, 1, 1]
+    : memref<4x2x16x8x8xf32, #gpu.address_space<workgroup>> to memref<4x2x16x8x8xf32, #gpu.address_space<workgroup>>
+  %1 = memref.collapse_shape %subview [[0], [1, 2], [3, 4]]
+    : memref<4x2x16x8x8xf32, #gpu.address_space<workgroup>> into memref<4x32x64xf32, #gpu.address_space<workgroup>>
+  %c0 = arith.constant 0 : index
+  %cst_0 = arith.constant 0.000000e+00 : f32
+  %3 = vector.transfer_read %a[%c0, %c0], %cst_0 {in_bounds = [true]} :
+    memref<1024x1024xf32>, vector<4xf32>
+  vector.transfer_write %3, %1[%c0, %c0, %c0] {in_bounds = [true]} :
+    vector<4xf32>, memref<4x32x64xf32, #gpu.address_space<workgroup>>
+  return
+}
+
 // -----
 
 // CHECK-LABEL: func.func @pad_alloc_negative