diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp index 3f4a0b8e9..bfff5aa58 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp @@ -6,6 +6,7 @@ #include "iree-amd-aie/Transforms/Passes.h" #include "iree/compiler/Codegen/Utils/Utils.h" +#include "llvm/ADT/StringExtras.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h" @@ -16,16 +17,128 @@ #include "mlir/Dialect/SCF/Transforms/TileUsingInterface.h" #include "mlir/Dialect/SCF/Transforms/Transforms.h" #include "mlir/Dialect/Utils/StaticValueUtils.h" +#include "mlir/Dialect/Utils/StructuredOpsUtils.h" #include "mlir/IR/Iterators.h" #include "mlir/IR/PatternMatch.h" #define DEBUG_TYPE "iree-amdaie-tile-and-fuse" - namespace mlir::iree_compiler::AMDAIE { namespace { +enum class GPUGroupType { Block, Thread }; + +/// Assign GPU dialect thread/block mapping attributes to tiled dimensions. +/// The returned vector's size is the number of non-zero values in +/// `tileSizesVal`. Failure is returned if it is not possible to assign +/// mapping attributes to the dimensions. +FailureOr> getGPUMappingAttributes( + ArrayRef tileSizesVal, GPUGroupType groupType, + TilingInterface op) { + MLIRContext *context = op.getContext(); + + // There is one induction variables in the scf.forall for each of the + // non-zero tile sizes. Recall that a '0' tile size corresponds to 'do + // not tile'. + uint32_t nbIndVars = std::count_if(tileSizesVal.begin(), tileSizesVal.end(), + [](int64_t t) { return t != 0; }); + + uint32_t nbIndVarsAboveOne = + std::count_if(tileSizesVal.begin(), tileSizesVal.end(), + [](int64_t t) { return t > 1; }); + + // The mlir::gpu::MappingId enum supports 13 dimensions, see: + // https://github.com/llvm/llvm-project/blob/main + // /mlir/include/mlir/Dialect/GPU/IR/GPUDeviceMappingAttr.td + if (nbIndVars > mlir::gpu::getMaxEnumValForMappingId()) { + return op->emitOpError("has too many dimensions to tile, ") + << "there are only " << mlir::gpu::getMaxEnumValForMappingId() + << " dimensions available in the mlir::gpu dialect, but " + << nbIndVars << " are required here.."; + } + + // Currently we expect only 2 tiled dimensions to be >1 when mapping + // to thread dimensions. This is to target the 2-D AIE array. + // + // TODO(newling) if there are 3+ dimensions, we probably need to collapse + // them into just 2. I'm leaving this as a follow-up task. Basically, instead + // of + // ```(i,j,k) in (2,3,5)``` + // we want + // ```(i,l) in (2,15)``` + // with then + // j=l/5 and k=l%5. + // + // Once the above is implemented, we can safely remove the following check: + if (nbIndVarsAboveOne > 2 && groupType == GPUGroupType::Thread) { + auto tileSizesStr = std::to_string(tileSizesVal[0]); + for (unsigned i = 1; i < tileSizesVal.size(); ++i) { + tileSizesStr += ", " + std::to_string(tileSizesVal[i]); + } + return op->emitOpError("has requested tile sizes [") + << tileSizesStr + << "]. Currently we only support tiling thread dimensions " + << "with at most 2 dimensions having a tile size greater than 1, " + << "there are " << nbIndVarsAboveOne << " here."; + } + + auto getMappingAttributeForDimension = [&](uint32_t i) -> Attribute { + auto id = static_cast(i); + if (groupType == GPUGroupType::Block) + return gpu::GPUBlockMappingAttr::get(context, id); + else if (groupType == GPUGroupType::Thread) + return gpu::GPUThreadMappingAttr::get(context, id); + else { + assert(false && "Unhandled group type, must be thread or block."); + } + }; + + // Map an integer to an Attribute as follows: + // 0 -> DimY + // 1 -> DimX + // 2 -> DimZ + // 3 -> LinearDim0 + // 4 -> LinearDim1 + // etc. + // + // Note that 0 and 1 are effectively swapped, because for AIE we want to + // map the first dimension to AIE array columns (or something like that). + auto getAttribute = [&](uint32_t i) -> Attribute { + if (i == 0) + return getMappingAttributeForDimension(1); + else if (i == 1) + return getMappingAttributeForDimension(0); + else + return getMappingAttributeForDimension(i); + }; + + // We give priority to tiling dimensions of size > 1, so that they + // preferentially get DimY and DimX. + SmallVector mapping(tileSizesVal.size(), {}); + uint32_t nAttributes = 0; + for (uint32_t i = 0; i < tileSizesVal.size(); ++i) { + if (tileSizesVal[i] > 1) { + mapping[i] = getAttribute(nAttributes); + ++nAttributes; + } + } + for (uint32_t i = 0; i < tileSizesVal.size(); ++i) { + if (!mapping[i] && tileSizesVal[i] > 0) { + mapping[i] = getAttribute(nAttributes); + ++nAttributes; + } + } + + // Squeeze out the empty attributes (corresponding to '0's in tileSizesVal). + SmallVector finalMapping; + finalMapping.reserve(nbIndVars); + for (Attribute attr : mapping) { + if (attr) finalMapping.push_back(attr); + } + return finalMapping; +} + /// Utility function to check if any of the reduction dimension is being tiled. static bool isTilingReductionDimension(TilingInterface consumerOp, SmallVector tileSizesVal) { @@ -157,27 +270,33 @@ void AMDAIETileAndFusePass::runOnOperation() { SmallVector tileSizes = getAsIndexOpFoldResult(context, tileSizesVal); + auto options = scf::SCFTilingOptions().setTileSizes(tileSizes); // When tiling using scf.for we do not need to set any mapping. if (!useSCFFor) { options.setLoopType(scf::SCFTilingOptions::LoopType::ForallOp); - // Here we assume there are always two levels of parallel (scf.forall) - // loops, and the first level of tiling is always using scf.forall and - // mapped to blocks. Currently we are not using mapping attributes for - // Conv2d ops, because there could be four parallel tiling dimensions. - // TODO (vivian): create AIE specific mapping attributes. - if (!isa(consumerOp.getOperation())) { - if (tilingLevel == 0) { - options.setMapping( - {gpu::GPUBlockMappingAttr::get(context, gpu::MappingId::DimY), - gpu::GPUBlockMappingAttr::get(context, gpu::MappingId::DimX)}); - } else { - options.setMapping( - {gpu::GPUThreadMappingAttr::get(context, gpu::MappingId::DimY), - gpu::GPUThreadMappingAttr::get(context, gpu::MappingId::DimX)}); - } + + // Currently only thread groups are used in lowering, blocks get unrolled + // temporally. In theory we should be able to just not add any block group + // dimensions to the outer scf.forall operation, but currently this results + // in compilation failure. What happens is + // 1) without any block group dimensions, the scf.forall operation can be + // be canonicalized away if the tile sizes are all 1 (small matmul, for + // example). Leaving only the inner thread scf.forall. + // 2) certain passes expect an outer scf.forall operation, so if it is + // canonicalized away, the pass fails. + // So for now we're keeping the block group dimension here, but should + // be able to compile without any block group dimensions TODO(newling) + auto groupType = + tilingLevel == 0 ? GPUGroupType::Block : GPUGroupType::Thread; + + auto maybeMapping = + getGPUMappingAttributes(tileSizesVal, groupType, consumerOp); + if (failed(maybeMapping)) { + return signalPassFailure(); } + options.setMapping(maybeMapping.value()); } IRRewriter rewriter(context); @@ -205,8 +324,7 @@ void AMDAIETileAndFusePass::runOnOperation() { // Fuse all Linalg ops (can be generalized later) .Default([&](Operation *op) { return op->getDialect() == - rewriter.getContext() - ->getLoadedDialect(); + context->getLoadedDialect(); }); return {fusableOp, false}; }); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp index b759c2a27..7f3d9cf12 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp @@ -644,6 +644,23 @@ void addMLIRAIRLoweringPasses(OpPassManager &passManager, AMDAIEDevice device) { passManager.addPass(memref::createFoldMemRefAliasOpsPass()); passManager.addPass(createAMDAIEBridgeToAIRPass()); + // Running canonicalization for all pipelines here results in failures. + // Example + // ``` + // 'memref.cast' op is an unsupported operation. This pass currently only + // supports AllocOp and SubViewOp as inputs. + // ``` + // It is currently required for the convolution pipeline though, to remove the + // extra (size-1) thread- and group- dimensions. + // + // TODO(newling) there are better solutions like: + // 1) make canonicalization work for scf.forall + // 2) pass to collapse rank-4 scf.foralls to rank-2 scf.foralls. + // 3) resolve above 'unsupproted operation' error. + if (clUseTilePipeline == TilePassPipeline::ConvDecomposePipeline) { + passManager.addPass(createCanonicalizerPass()); + } + // TODO (Erwei): Figure out a way to work with AMDAIEPackToDmaPass. if (clUseTilePipeline == TilePassPipeline::PackPeelPipeline) passManager.addPass(createAMDAIEDecomposeLinalgExtPackUnPackToAIRPass()); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/tile_and_fuse_using_scf_forall.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/tile_and_fuse_using_scf_forall.mlir index 803a94395..407106fbe 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/tile_and_fuse_using_scf_forall.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/tile_and_fuse_using_scf_forall.mlir @@ -1,5 +1,5 @@ -// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-tile-and-fuse{tiling-level=0}))' --split-input-file %s | FileCheck %s --check-prefix=TILE-LEVEL-0 -// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-tile-and-fuse{tiling-level=1}))' --split-input-file %s | FileCheck %s --check-prefix=TILE-LEVEL-1 +// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-tile-and-fuse{tiling-level=0}))' --split-input-file %s | FileCheck %s --check-prefix=TILE-LEVEL-0 +// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-tile-and-fuse{tiling-level=1}))' --split-input-file --verify-diagnostics %s | FileCheck %s --check-prefix=TILE-LEVEL-1 // RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-tile-and-fuse{tiling-level=0 tile-elementwise=false}))' --split-input-file %s | FileCheck %s --check-prefix=TILE-MATMUL-ONLY func.func @matmul_static(%arg0: tensor<8x16xi32>, %arg1 : tensor<16x8xi32>) -> tensor<8x8xi32> { @@ -32,7 +32,25 @@ func.func @conv_2d_nhwc_hwcf(%arg0: tensor<2x14x14x32xbf16>, %arg1: tensor<3x3x3 // TILE-LEVEL-0-SAME: { // TILE-LEVEL-0: linalg.fill // TILE-LEVEL-0: linalg.conv_2d_nhwc_hwcf -// TILE-LEVEL-0: } +// TILE-LEVEL-0: } {mapping = [#gpu.block, #gpu.block, #gpu.block]} + +// TILE-LEVEL-1: @conv_2d_nhwc_hwcf +// TILE-LEVEL-1: scf.forall +// TILE-LEVEL-1-SAME: { +// TILE-LEVEL-1: linalg.fill +// TILE-LEVEL-1: linalg.conv_2d_nhwc_hwcf +// TILE-LEVEL-1: } {mapping = [#gpu.thread, #gpu.thread, #gpu.thread, #gpu.thread]} + +// ----- + +func.func @conv_2d_nhwc_hwcf_unsupported_tiling(%arg0: tensor<2x14x14x32xbf16>, %arg1: tensor<3x3x32x64xbf16>) -> tensor<2x12x12x64xf32> { + %cst = arith.constant 0.000000e+00 : f32 + %0 = tensor.empty() : tensor<2x12x12x64xf32> + %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<2x12x12x64xf32>) -> tensor<2x12x12x64xf32> + // expected-error @+1 {{'linalg.conv_2d_nhwc_hwcf' op has requested tile sizes [1, 4, 4, 4, 0, 0, 0]. Currently we only support tiling thread dimensions with at most 2 dimensions having a tile size greater than 1, there are 3 here.}} + %2 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<2x14x14x32xbf16>, tensor<3x3x32x64xbf16>) outs(%1 : tensor<2x12x12x64xf32>) -> tensor<2x12x12x64xf32> + return %2 : tensor<2x12x12x64xf32> +} // -----