From 0f8c087bd7b5816e9e570197010148608ea3f186 Mon Sep 17 00:00:00 2001 From: CJ77Qi Date: Mon, 19 Aug 2024 05:09:20 +0000 Subject: [PATCH 1/7] Buddy GPU GEMM linalg tensor opt - tiling --- midend/include/Dialect/CMakeLists.txt | 1 + .../include/Dialect/Transform/CMakeLists.txt | 3 + midend/include/Dialect/Transform/Passes.h | 17 + midend/include/Dialect/Transform/Passes.td | 10 + .../Transforms/TransformDialectInterpreter.h | 10 + .../Transform/Transforms/TransformInsertion.h | 19 ++ .../MatMulOptimization/CMakeLists.txt | 2 + .../MatMulOptimization/GPU/CMakeLists.txt | 11 + .../GPU/MatmulTilingOpt.cpp | 297 ++++++++++++++++++ midend/lib/Dialect/CMakeLists.txt | 1 + midend/lib/Dialect/Transform/CMakeLists.txt | 1 + .../Transform/Transforms/CMakeLists.txt | 16 + .../TransformDialectInterpreter.cpp | 45 +++ .../Transforms/TransformInsertion.cpp | 88 ++++++ 14 files changed, 521 insertions(+) create mode 100644 midend/include/Dialect/Transform/CMakeLists.txt create mode 100644 midend/include/Dialect/Transform/Passes.h create mode 100644 midend/include/Dialect/Transform/Passes.td create mode 100644 midend/include/Dialect/Transform/Transforms/TransformDialectInterpreter.h create mode 100644 midend/include/Dialect/Transform/Transforms/TransformInsertion.h create mode 100644 midend/lib/Conversion/MatMulOptimization/GPU/CMakeLists.txt create mode 100644 midend/lib/Conversion/MatMulOptimization/GPU/MatmulTilingOpt.cpp create mode 100644 midend/lib/Dialect/Transform/CMakeLists.txt create mode 100644 midend/lib/Dialect/Transform/Transforms/CMakeLists.txt create mode 100644 midend/lib/Dialect/Transform/Transforms/TransformDialectInterpreter.cpp create mode 100644 midend/lib/Dialect/Transform/Transforms/TransformInsertion.cpp diff --git a/midend/include/Dialect/CMakeLists.txt b/midend/include/Dialect/CMakeLists.txt index 8ab8f29f58..7b472d0d2d 100644 --- a/midend/include/Dialect/CMakeLists.txt +++ b/midend/include/Dialect/CMakeLists.txt @@ -5,3 +5,4 @@ add_subdirectory(RVV) add_subdirectory(VectorExp) add_subdirectory(Gemmini) add_subdirectory(Sche) +add_subdirectory(Transform) \ No newline at end of file diff --git a/midend/include/Dialect/Transform/CMakeLists.txt b/midend/include/Dialect/Transform/CMakeLists.txt new file mode 100644 index 0000000000..a63c82dbdf --- /dev/null +++ b/midend/include/Dialect/Transform/CMakeLists.txt @@ -0,0 +1,3 @@ +set(LLVM_TARGET_DEFINITIONS Passes.td) +mlir_tablegen(Passes.h.inc -gen-pass-decls) +add_public_tablegen_target(BuddyTransformPassIncGen) \ No newline at end of file diff --git a/midend/include/Dialect/Transform/Passes.h b/midend/include/Dialect/Transform/Passes.h new file mode 100644 index 0000000000..1c11fc4010 --- /dev/null +++ b/midend/include/Dialect/Transform/Passes.h @@ -0,0 +1,17 @@ +#pragma once +#include "mlir/Pass/Pass.h" +#include "mlir/Pass/PassRegistry.h" +namespace mlir { +class ModuleOp; +// Generate the definition of Transform Passes +#define GEN_PASS_DECL +#include "Transform/Passes.h.inc" + +// Include the constructor of passes in Transform Dialect +#include "Transform/Transforms/TransformDialectInterpreter.h" +#include "Transform/Transforms/TransformInsertion.h" + +#define GEN_PASS_REGISTRATION +#include "Transform/Passes.h.inc" + +} \ No newline at end of file diff --git a/midend/include/Dialect/Transform/Passes.td b/midend/include/Dialect/Transform/Passes.td new file mode 100644 index 0000000000..f3790312ce --- /dev/null +++ b/midend/include/Dialect/Transform/Passes.td @@ -0,0 +1,10 @@ +include "mlir/Pass/PassBase.td" + +// 
TransformDialectInterpreter +def TransformDialectInterpreter : Pass<"transform-dialect-interpreter", "ModuleOp"> { + let summary = "Apply transform dialect operations one by one"; + let constructor = "mlir::createTransformDialectInterpreter()"; + let options = [ + Option<"eraseAfter", "erase-after", "bool", "false", "erase Transform Ops after applied"> + ]; +} diff --git a/midend/include/Dialect/Transform/Transforms/TransformDialectInterpreter.h b/midend/include/Dialect/Transform/Transforms/TransformDialectInterpreter.h new file mode 100644 index 0000000000..d8038c59ec --- /dev/null +++ b/midend/include/Dialect/Transform/Transforms/TransformDialectInterpreter.h @@ -0,0 +1,10 @@ +#include "mlir/Pass/Pass.h" +#include + +namespace mlir { +class ModuleOp; + +std::unique_ptr> +createTransformDialectInterpreter(bool eraseAfter = false); + +} // namespace mlir \ No newline at end of file diff --git a/midend/include/Dialect/Transform/Transforms/TransformInsertion.h b/midend/include/Dialect/Transform/Transforms/TransformInsertion.h new file mode 100644 index 0000000000..aca6a50177 --- /dev/null +++ b/midend/include/Dialect/Transform/Transforms/TransformInsertion.h @@ -0,0 +1,19 @@ +#include "mlir/Pass/Pass.h" +#include +#include + +namespace mlir { +class ModuleOp; +class ImplicitLocOpBuilder; + +struct TransformInsertionConfig { + std::string funcAnchor; + std::string matchPrefix; + std::function opFilter; + std::function transformBuilder; +} + +std::unique_ptr> +createGenericTransformInsertionPass(const TransformInsertionConfig &config); + +} \ No newline at end of file diff --git a/midend/lib/Conversion/MatMulOptimization/CMakeLists.txt b/midend/lib/Conversion/MatMulOptimization/CMakeLists.txt index 8e726863eb..69405c053e 100644 --- a/midend/lib/Conversion/MatMulOptimization/CMakeLists.txt +++ b/midend/lib/Conversion/MatMulOptimization/CMakeLists.txt @@ -14,3 +14,5 @@ add_mlir_library(BatchMatMulOptimization add_mlir_library(MatMulParallelVectorization MatMulParallelVectorization.cpp ) + +add_subdirectory(GPU) \ No newline at end of file diff --git a/midend/lib/Conversion/MatMulOptimization/GPU/CMakeLists.txt b/midend/lib/Conversion/MatMulOptimization/GPU/CMakeLists.txt new file mode 100644 index 0000000000..03150ba4b0 --- /dev/null +++ b/midend/lib/Conversion/MatMulOptimization/GPU/CMakeLists.txt @@ -0,0 +1,11 @@ +add_mlir_library(BuddyIRGPUMatmulOpt + DEPENDS + MhloDialect + MLIRBufferTransforms + + LINK_LIBS PUBLIC + MLIRIR + MLIRTransforms + MLIRLinalgExtTransformOps + BuddyUtils +) \ No newline at end of file diff --git a/midend/lib/Conversion/MatMulOptimization/GPU/MatmulTilingOpt.cpp b/midend/lib/Conversion/MatMulOptimization/GPU/MatmulTilingOpt.cpp new file mode 100644 index 0000000000..11daea6998 --- /dev/null +++ b/midend/lib/Conversion/MatMulOptimization/GPU/MatmulTilingOpt.cpp @@ -0,0 +1,297 @@ +#include "mlir/Pass/PassManager.h" +#include "mlir/Pass/PassOptions.h" +#include "mlir/Pass/PassRegistry.h" + +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h" +#include "mlir/Dialect/MemRef/TransformOps/MemRefTransformOps.h" +#include "mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/IR/BuiltinOps.h" +#include "llvm/ADT/SmallSet.h" + +#include + +namespace mlir { + +struct GPUGemmCodegenConfigOptions + : public PassPipelineOptions { + Option funcAnchor{ + *this, "func-anchor", + 
llvm::cl::desc( + "An optional Unit attribute anchoring on target functions."), + llvm::cl::init("")}; + Option annotatePrefix{ + *this, "annotate-prefix", + llvm::cl::desc("An optional annotate prefix attribute on target ops."), + llvm::cl::init("__buddyir_gpu_tile_gemm")}; + ListOption tileSizeConfig{ + *this, "tile-size-config", + llvm::cl::desc("An optional tile size config for tile matmul op.")}; + ListOption workgroupSize{ + *this, "workgroup-size", + llvm::cl::desc("An optional workgroup size config for tile matmul op.")}; + Option stages{ + *this, "stages", llvm::cl::desc("An optional stages for tile matmul op."), + llvm::cl::init(3)}; +}; + +struct GPUGemmGeneralOptions + : public PassPipelineOptions { + Option funcAnchor{ + *this, "func-anchor", + llvm::cl::desc( + "An optional Unit attribute anchoring on target functions."), + llvm::cl::init("")}; + Option annotatePrefix{ + *this, "annotate-prefix", + llvm::cl::desc("An optional annotate prefix attribute on target ops."), + llvm::cl::init("__buddyir_gpu_tile_gemm")}; +}; + +using namespace mlir; + +namespace { + +constexpr StringRef getLinalgToGPUAttrName() { return "__buddyir_to_gpu__"; } + +constexpr StringRef getLinalgTargetAttrName() { return "__buddyir_target__"; } + +template +void invokeOpPassPipelineBuilder(Builder builder, OpPassManager &pm, + Args &&...args) { + if (pm.getOpAnchorName() != OpPassManager::getAnyOpAnchorName() && + pm.getOpAnchorName() != OpClass::getOperationName()) { + if (pm.getNesting() == OpPassManager::Nesting::Implicit) { + builder(pm.nest(), std::forward(args)...); + return; + } + llvm::report_fatal_error( + llvm::Twine("Can't build pass pipeline on expected op type ") + + OpClass::getOperationName() + " but got " + pm.getOpAnchorName()); + } else { + builder(pm, std::forward(args)...); + } +} + +void createGPUTileGemmTransformImpl(OpPassManager &pm, + const std::string &anchor, + const std::string &prefix) { + TransformInsertionConfig config; + config.funcAnchor = anchor; + config.matchPrefix = prefix; + config.opFilter = [=](Operation *op) { + if (!isLinalgOpMatmul(op)) + return false; + return true; + }; + + config.transformBuilder = [=](ImplicitLocOpBuilder &b, Operation *op, + Value pdlV) { + func::FuncOp funcOp = op->getParentOfType(); + linalg::LinalgOp linalgOp = cast(op); + Operation *user = *linalgOp->getUsers().begin(); + bool hasEpilogue = isa(user); + + if (hasEpilogue) { + setMarker(user, getEpilogueMarker()); + } + + bool isBMM = linalgOp.getNumParallelLoops() == 3; + + SmallVector tileSizeConfig = getGemmTileSize(funcOp).value(); + + auto func = b.create( + pdlV.getType(), pdlV, + /* isolated_from_above */ false, + /* allow_empty_results */ false, + /* op_name */ b.getStringAttr(func::FuncOp::getOperationName()), + /* deduplicate */ false, + /* nth_parent */ 1); + + auto anyType = transform::AnyOpType::get(b.getContext()); + auto linalgFillType = transform::OperationType::get( + b.getContext(), linalg::FillOp::getOperationName()); + auto linalgFill = b.create( + linalgFillType, func, linalg::FillOp::getOperationName()); + + Value mmaLevel = b.create( + /* type */ pdl::AttributeType::get(b.getContext()), + /* value */ b.getStringAttr("Threadblock")); + Value target = b.create( + /* type */ pdl::AttributeType::get(b.getContext()), + /* value */ b.getStringAttr("nv_sm_80")); + + SmallVector mappingIdx; + if (isBMM) { + mappingIdx = {2, 1, 0}; + } else { + mappingIdx = {1, 0}; + } + auto mapping = llvm::to_vector(llvm::map_range( + mappingIdx, [](int64_t i) { return static_cast(i); 
})); + auto mappingAttrs = llvm::to_vector( + llvm::map_range(mapping, [&](gpu::MappingId dim) -> Attribute { + return gpu::GPUBlockMappingAttr::get(b.getContext(), dim); + })); + + SmallVector parrallelTileSizes; + if (isBMM) { + parrallelTileSizes = {1, tileSizeConfig[0], tileSizeConfig[1]}; + } else { + parrallelTileSizes = {tileSizeConfig[0], tileSizeConfig[1]}; + } + Value tiledMatmulOp; + if (hasEpilogue) { + auto linalgGenericType = transform::OperationType::get( + b.getContext(), linalg::GenericOp::getOperationName()); + auto epilogue = b.create( + linalgGenericType, func, + b.getStrArrayAttr({linalg::GenericOp::getOperationName()}), + /*matchInterfaceEnum=*/transform::MatchInterfaceEnumAttr(), + /*opAttrs=*/ + b.getDictionaryAttr({NamedAttribute( + b.getStringAttr(getEpilogueMarker()), b.getUnitAttr())}), + /*filterResultType=*/TypeAttr(), + /*filterOperandTYpes=*/ArrayAttr()); + + transform::TileUsingForallOp tileOp = + b.create( + /* target */ epilogue, + /* staticTileSizes */ parrallelTileSizes, + /* ctor tag */ transform::TileSizesSpec(), + /* mapping */ b.getArrayAttr(mappingAttrs)); + transform::FuseIntoContainingOp fuse = + b.create( + /* producerOp */ pdlV, + /* containingOp */ tileOp.getForallOp()); + b.create( + /* producerOp */ linalgFill, + /* containingOp */ fuse.getNewContainingOp()); + tiledMatmulOp = fuse.getFusedOp(); + } else { + transform::TileUsingForallOp tileOp = + b.create( + /* target */ pdlV, + /* staticTileSizes */ parrallelTileSizes, + /* ctor tag */ transform::TileSizesSpec(), + /* mapping */ b.getArrayAttr(mappingAttrs)); + + b.create( + /* producerOp */ linalgFill, + /* containingOp */ tileOp.getForallOp()); + tiledMatmulOp = tileOp.getTiledOp(); + } + + SmallVector reductionTileSizes; + if (isBMM) + reductionTileSizes = {0, 0, 0, tileSizeConfig[2]}; + else + reductionTileSizes = {0, 0, tileSizeConfig[2]}; + auto tileKMatmulOp = + b.create(tiledMatmulOp, reductionTileSizes); + auto matmulKOp = tileKMatmulOp.getTiledLinalgOp(); + auto forLoops = tileKMatmulOp.getLoops(); + if (!forLoops.empty()) { + b.create(forLoops[0], getMatmulMainLoopMarker(), + Value()); + } else { + b.create(matmulKOp, getMatmulMainLoopMarker(), + Value()); + } + + b.create(matmulKOp, getLinalgMMALevelAttrName(), + mmaLevel); + b.create(matmulKOp, getLinalgTargetAttrName(), + target); + b.create(matmulKOp, getMMAPatternAttrName(), + Value()); + }; + + pm.addPass(createGenericTransformInsertionPass(config)); +} + +} // namespace + +void mlir::createGPUTileGemmTransform(OpPassManager &pm, + const GPUGemmGeneralOptions &options) { + invokeOpPassPipelineBuilder(createGPUTileGemmTransformImpl, pm, + options.funcAnchor, options.annotatePrefix); +} + +namespace { + +void createGPUAddGemmCodegenLoweringConfigTransformImpl( + OpPassManager &pm, const std::string &anchor, const std::string &prefix, + ArrayRef tileSizeConfig, ArrayRef workgroupSize, + int64_t stages) { + + SmallVector tileSizeConfigVec{tileSizeConfig}; + SmallVector workgroupSizeVec{workgroupSize}; + + TransformInsertionConfig config; + config.funcAnchor = anchor; + config.matchPrefix = prefix; + + config.opFilter = [=](Operation *op) { + if (isLinalgOpMatmul(op)) { + // TODO: check if the matmul op is already annotated + // TODO: Add different lowering config for different matmul op size + return true; + } + return false; + }; + + config.transformBuilder = [=](ImplicitLocOpBuilder &b, Operation *op, + Value pdlV) { + // auto linalgOp = llvm::cast(op); + auto tileSizeConfigAttrs = b.getAttr(llvm::to_vector( + 
llvm::map_range(tileSizeConfigVec, [&](int64_t i) -> Attribute { + return b.getI64IntegerAttr(i); + }))); + auto workgroupSizeAttrs = b.getAttr(llvm::to_vector( + llvm::map_range(workgroupSizeVec, [&](int64_t i) -> Attribute { + return b.getI64IntegerAttr(i); + }))); + auto stagesAttr = b.getI64IntegerAttr(stages); + + auto func = b.create( + pdlV.getType(), pdlV, + /* isolated_from_above */ true, + /* allow_empty_results */ false, + /* op_name */ b.getStringAttr(func::FuncOp::getOperationName()), + /* deduplicate */ false, + /* nth_parent */ 1); + + Value tileSizeConfigValue = b.create( + /* type */ pdl::AttributeType::get(b.getContext()), + /* value */ tileSizeConfigAttrs); + Value workgroupSizeValue = b.create( + /* type */ pdl::AttributeType::get(b.getContext()), + /* value */ workgroupSizeAttrs); + Value stagesValue = b.create( + /* type */ pdl::AttributeType::get(b.getContext()), + /* value */ stagesAttr); + + b.create(func, getGemmTileConfigAttrName(), + tileSizeConfigValue); + b.create(func, getGemmBlockSizeAttrName(), + workgroupSizeValue); + b.create(func, getGemmPipelineDepthAttrName(), + stagesValue); + }; + pm.addPass(createGenericTransformInsertionPass(config)); +} +} // namespace + +void mlir::createGPUAddGemmCodegenLoweringConfigTransform( + OpPassManager &pm, const GPUGemmCodegenConfigOptions &options) { + invokeOpPassPipelineBuilder( + createGPUAddGemmCodegenLoweringConfigTransformImpl, pm, + options.funcAnchor, options.annotatePrefix, options.tileSizeConfig, + options.workgroupSize, options.stages); +} + +} \ No newline at end of file diff --git a/midend/lib/Dialect/CMakeLists.txt b/midend/lib/Dialect/CMakeLists.txt index 8ab8f29f58..304de8570c 100644 --- a/midend/lib/Dialect/CMakeLists.txt +++ b/midend/lib/Dialect/CMakeLists.txt @@ -5,3 +5,4 @@ add_subdirectory(RVV) add_subdirectory(VectorExp) add_subdirectory(Gemmini) add_subdirectory(Sche) +add_subdirectory(Transform) diff --git a/midend/lib/Dialect/Transform/CMakeLists.txt b/midend/lib/Dialect/Transform/CMakeLists.txt new file mode 100644 index 0000000000..5c919f7dfc --- /dev/null +++ b/midend/lib/Dialect/Transform/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(Transforms) \ No newline at end of file diff --git a/midend/lib/Dialect/Transform/Transforms/CMakeLists.txt b/midend/lib/Dialect/Transform/Transforms/CMakeLists.txt new file mode 100644 index 0000000000..ac0fe3b197 --- /dev/null +++ b/midend/lib/Dialect/Transform/Transforms/CMakeLists.txt @@ -0,0 +1,16 @@ +add_mlir_dialect_library(BuddyTransformPasses + TransformDialectInterpreter.cpp + TransformInsertion.cpp + + DEPENDS + BuddyTransformPassIncGen +# MLIRLinalgExtTransformOps +# MLIRTransformExtOpsIncGen + + LINK_LIBS PUBLIC + MLIRIR + MLIRPass + MLIRPDLDialect + MLIRTransformDialect +# MLIRLinalgExtTransformOps +) diff --git a/midend/lib/Dialect/Transform/Transforms/TransformDialectInterpreter.cpp b/midend/lib/Dialect/Transform/Transforms/TransformDialectInterpreter.cpp new file mode 100644 index 0000000000..24f81e5a89 --- /dev/null +++ b/midend/lib/Dialect/Transform/Transforms/TransformDialectInterpreter.cpp @@ -0,0 +1,45 @@ +#define GEN_PASS_DEF_TRANSFORMDIALECTINTERPRETER +#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h" +#include "Transform/Passes.h" +#include "mlir/IR/BuiltinOps.h" + +using namespace mlir; + +namespace { + +struct TransformDialectInterpreterPass + : public impl::TransformDialectInterpreterBase { + + explicit TransformDialectInterpreterPass(bool erase) + : TransformDialectInterpreterBase() { + eraseAfter = erase; + } + + 
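+  // Walks the module's top-level transform ops (anything implementing
+  // TransformOpInterface) and applies each one to the module payload with
+  // expensive checks disabled; when `eraseAfter` is set, the transform IR
+  // is erased from the module once it has been applied.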
+  void runOnOperation() override {
+    ModuleOp module = getOperation();
+    for (auto op : module.getOps<transform::TransformOpInterface>()) {
+      RaggedArray<transform::MappedValue> extraMappings;
+      if (failed(transform::applyTransforms(
+              module, op, extraMappings,
+              transform::TransformOptions().enableExpensiveChecks(false)))) {
+        return signalPassFailure();
+      }
+    }
+    if (eraseAfter) {
+      module.walk([&](Operation *nestedOp) {
+        if (isa<transform::TransformOpInterface>(nestedOp)) {
+          nestedOp->erase();
+          return WalkResult::skip();
+        }
+        return WalkResult::advance();
+      });
+    }
+  }
+};
+
+} // namespace
+
+std::unique_ptr<OperationPass<ModuleOp>>
+createTransformDialectInterpreter(bool eraseAfter = false) {
+  return std::make_unique<TransformDialectInterpreterPass>(eraseAfter);
+}
\ No newline at end of file
diff --git a/midend/lib/Dialect/Transform/Transforms/TransformInsertion.cpp b/midend/lib/Dialect/Transform/Transforms/TransformInsertion.cpp
new file mode 100644
index 0000000000..6595b8347c
--- /dev/null
+++ b/midend/lib/Dialect/Transform/Transforms/TransformInsertion.cpp
@@ -0,0 +1,88 @@
+#include "Transform/Passes.h"
+
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/Linalg/TransformOps/DialectExtension.h"
+#include "mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h"
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/Dialect/Transform/IR/TransformDialect.h"
+#include "mlir/Dialect/Transform/IR/TransformOps.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/Dialect.h"
+#include "mlir/IR/IRMapping.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/IR/Operation.h"
+
+#include
+#include
+
+using namespace mlir;
+using namespace llvm;
+
+namespace {
+
+void insertTransformIR(func::FuncOp funcOp, OpBuilder &builder,
+                       const TransformInsertionConfig &config) {
+  funcOp->walk([&](Operation *op) {
+    if (config.opFilter(op)) {
+      ImplicitLocOpBuilder b(op->getLoc(), builder);
+      MLIRContext *ctx = b.getContext();
+
+      auto annotation = getAnnotationUniqueIdentifier(config.matchPrefix);
+      op->setAttr(annotation, UnitAttr::get(ctx));
+
+      auto pdlOperationType = pdl::OperationType::get(ctx);
+      b.create<transform::SequenceOp>(
+          TypeRange(), transform::FailurePropagationMode::Propagate,
+          pdlOperationType, [&](OpBuilder &b, Location loc, Value blockArg) {
+            auto annotationAttr = DictionaryAttr::get(
+                ctx, b.getNamedAttr(annotation, UnitAttr::get(ctx)));
+            auto match = b.create<transform::MatchOp>(
+                loc, blockArg.getType(), blockArg, ArrayAttr(),
+                transform::MatchInterfaceEnumAttr(), annotationAttr, TypeAttr(),
+                ArrayAttr());
+            ImplicitLocOpBuilder ib(loc, b);
+            config.transformBuilder(ib, op, match);
+            b.create<transform::YieldOp>(loc);
+          });
+    }
+  });
+}
+
+void insertTransformIR(ModuleOp module, const TransformInsertionConfig &config) {
+  OpBuilder builder = OpBuilder::atBlockEnd(module.getBody());
+  for (auto funcOp : module.getOps<func::FuncOp>()) {
+    if (!config.funcAnchor.empty() && !funcOp->hasAttr(config.funcAnchor)) {
+      continue;
+    }
+    insertTransformIR(funcOp, builder, config);
+  }
+}
+
+struct GenericTransformInsertionPass
+    : public PassWrapper<GenericTransformInsertionPass,
+                         OperationPass<ModuleOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(GenericTransformInsertionPass)
+
+  GenericTransformInsertionPass(const TransformInsertionConfig &config) : config(config) {}
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<transform::TransformDialect>();
+  }
+
+  void runOnOperation() override {
+    insertTransformIR(getOperation(), config);
+  }
+
+protected:
+  TransformInsertionConfig config;
+};
+
+} // namespace
+
+std::unique_ptr<OperationPass<ModuleOp>>
+mlir::createGenericTransformInsertionPass(const TransformInsertionConfig &config) { + return std::make_unique(config); +} \ No newline at end of file From 7a18985508eb4815582f3241e6e95424eae70ff9 Mon Sep 17 00:00:00 2001 From: CJ77Qi Date: Wed, 21 Aug 2024 09:49:54 +0000 Subject: [PATCH 2/7] fix bug --- midend/include/Dialect/Transform/Passes.h | 10 +- midend/include/Dialect/Transform/Passes.td | 2 +- .../Transforms/TransformDialectInterpreter.h | 1 + .../Transform/Transforms/TransformInsertion.h | 1 + .../MatMulOptimization/CMakeLists.txt | 4 +- .../MatMulOptimization/GPU/CMakeLists.txt | 11 - .../GPU/MatmulTilingOpt.cpp | 297 ------------------ .../Dialect/Transform/Transforms/PassDetail.h | 13 + .../TransformDialectInterpreter.cpp | 14 +- .../Transforms/TransformInsertion.cpp | 16 +- 10 files changed, 42 insertions(+), 327 deletions(-) delete mode 100644 midend/lib/Conversion/MatMulOptimization/GPU/CMakeLists.txt delete mode 100644 midend/lib/Conversion/MatMulOptimization/GPU/MatmulTilingOpt.cpp create mode 100644 midend/lib/Dialect/Transform/Transforms/PassDetail.h diff --git a/midend/include/Dialect/Transform/Passes.h b/midend/include/Dialect/Transform/Passes.h index 1c11fc4010..ae7bc8b394 100644 --- a/midend/include/Dialect/Transform/Passes.h +++ b/midend/include/Dialect/Transform/Passes.h @@ -1,16 +1,16 @@ -#pragma once #include "mlir/Pass/Pass.h" #include "mlir/Pass/PassRegistry.h" + +// Include the constructor of passes in Transform Dialect +#include "Transform/Transforms/TransformDialectInterpreter.h" +#include "Transform/Transforms/TransformInsertion.h" + namespace mlir { class ModuleOp; // Generate the definition of Transform Passes #define GEN_PASS_DECL #include "Transform/Passes.h.inc" -// Include the constructor of passes in Transform Dialect -#include "Transform/Transforms/TransformDialectInterpreter.h" -#include "Transform/Transforms/TransformInsertion.h" - #define GEN_PASS_REGISTRATION #include "Transform/Passes.h.inc" diff --git a/midend/include/Dialect/Transform/Passes.td b/midend/include/Dialect/Transform/Passes.td index f3790312ce..104523d2e8 100644 --- a/midend/include/Dialect/Transform/Passes.td +++ b/midend/include/Dialect/Transform/Passes.td @@ -1,7 +1,7 @@ include "mlir/Pass/PassBase.td" // TransformDialectInterpreter -def TransformDialectInterpreter : Pass<"transform-dialect-interpreter", "ModuleOp"> { +def TransformDialectInterpreter : Pass<"transform-dialect-interpreter", "mlir::ModuleOp"> { let summary = "Apply transform dialect operations one by one"; let constructor = "mlir::createTransformDialectInterpreter()"; let options = [ diff --git a/midend/include/Dialect/Transform/Transforms/TransformDialectInterpreter.h b/midend/include/Dialect/Transform/Transforms/TransformDialectInterpreter.h index d8038c59ec..fcd54ba8c1 100644 --- a/midend/include/Dialect/Transform/Transforms/TransformDialectInterpreter.h +++ b/midend/include/Dialect/Transform/Transforms/TransformDialectInterpreter.h @@ -2,6 +2,7 @@ #include namespace mlir { + class ModuleOp; std::unique_ptr> diff --git a/midend/include/Dialect/Transform/Transforms/TransformInsertion.h b/midend/include/Dialect/Transform/Transforms/TransformInsertion.h index aca6a50177..22ad91c09e 100644 --- a/midend/include/Dialect/Transform/Transforms/TransformInsertion.h +++ b/midend/include/Dialect/Transform/Transforms/TransformInsertion.h @@ -3,6 +3,7 @@ #include namespace mlir { + class ModuleOp; class ImplicitLocOpBuilder; diff --git a/midend/lib/Conversion/MatMulOptimization/CMakeLists.txt 
b/midend/lib/Conversion/MatMulOptimization/CMakeLists.txt index 69405c053e..c8695e600d 100644 --- a/midend/lib/Conversion/MatMulOptimization/CMakeLists.txt +++ b/midend/lib/Conversion/MatMulOptimization/CMakeLists.txt @@ -13,6 +13,4 @@ add_mlir_library(BatchMatMulOptimization add_mlir_library(MatMulParallelVectorization MatMulParallelVectorization.cpp -) - -add_subdirectory(GPU) \ No newline at end of file +) \ No newline at end of file diff --git a/midend/lib/Conversion/MatMulOptimization/GPU/CMakeLists.txt b/midend/lib/Conversion/MatMulOptimization/GPU/CMakeLists.txt deleted file mode 100644 index 03150ba4b0..0000000000 --- a/midend/lib/Conversion/MatMulOptimization/GPU/CMakeLists.txt +++ /dev/null @@ -1,11 +0,0 @@ -add_mlir_library(BuddyIRGPUMatmulOpt - DEPENDS - MhloDialect - MLIRBufferTransforms - - LINK_LIBS PUBLIC - MLIRIR - MLIRTransforms - MLIRLinalgExtTransformOps - BuddyUtils -) \ No newline at end of file diff --git a/midend/lib/Conversion/MatMulOptimization/GPU/MatmulTilingOpt.cpp b/midend/lib/Conversion/MatMulOptimization/GPU/MatmulTilingOpt.cpp deleted file mode 100644 index 11daea6998..0000000000 --- a/midend/lib/Conversion/MatMulOptimization/GPU/MatmulTilingOpt.cpp +++ /dev/null @@ -1,297 +0,0 @@ -#include "mlir/Pass/PassManager.h" -#include "mlir/Pass/PassOptions.h" -#include "mlir/Pass/PassRegistry.h" - -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/GPU/IR/GPUDialect.h" -#include "mlir/Dialect/Linalg/IR/Linalg.h" -#include "mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h" -#include "mlir/Dialect/MemRef/TransformOps/MemRefTransformOps.h" -#include "mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.h" -#include "mlir/Dialect/Tensor/IR/Tensor.h" -#include "mlir/IR/BuiltinOps.h" -#include "llvm/ADT/SmallSet.h" - -#include - -namespace mlir { - -struct GPUGemmCodegenConfigOptions - : public PassPipelineOptions { - Option funcAnchor{ - *this, "func-anchor", - llvm::cl::desc( - "An optional Unit attribute anchoring on target functions."), - llvm::cl::init("")}; - Option annotatePrefix{ - *this, "annotate-prefix", - llvm::cl::desc("An optional annotate prefix attribute on target ops."), - llvm::cl::init("__buddyir_gpu_tile_gemm")}; - ListOption tileSizeConfig{ - *this, "tile-size-config", - llvm::cl::desc("An optional tile size config for tile matmul op.")}; - ListOption workgroupSize{ - *this, "workgroup-size", - llvm::cl::desc("An optional workgroup size config for tile matmul op.")}; - Option stages{ - *this, "stages", llvm::cl::desc("An optional stages for tile matmul op."), - llvm::cl::init(3)}; -}; - -struct GPUGemmGeneralOptions - : public PassPipelineOptions { - Option funcAnchor{ - *this, "func-anchor", - llvm::cl::desc( - "An optional Unit attribute anchoring on target functions."), - llvm::cl::init("")}; - Option annotatePrefix{ - *this, "annotate-prefix", - llvm::cl::desc("An optional annotate prefix attribute on target ops."), - llvm::cl::init("__buddyir_gpu_tile_gemm")}; -}; - -using namespace mlir; - -namespace { - -constexpr StringRef getLinalgToGPUAttrName() { return "__buddyir_to_gpu__"; } - -constexpr StringRef getLinalgTargetAttrName() { return "__buddyir_target__"; } - -template -void invokeOpPassPipelineBuilder(Builder builder, OpPassManager &pm, - Args &&...args) { - if (pm.getOpAnchorName() != OpPassManager::getAnyOpAnchorName() && - pm.getOpAnchorName() != OpClass::getOperationName()) { - if (pm.getNesting() == OpPassManager::Nesting::Implicit) { - builder(pm.nest(), std::forward(args)...); - return; - } - 
llvm::report_fatal_error( - llvm::Twine("Can't build pass pipeline on expected op type ") + - OpClass::getOperationName() + " but got " + pm.getOpAnchorName()); - } else { - builder(pm, std::forward(args)...); - } -} - -void createGPUTileGemmTransformImpl(OpPassManager &pm, - const std::string &anchor, - const std::string &prefix) { - TransformInsertionConfig config; - config.funcAnchor = anchor; - config.matchPrefix = prefix; - config.opFilter = [=](Operation *op) { - if (!isLinalgOpMatmul(op)) - return false; - return true; - }; - - config.transformBuilder = [=](ImplicitLocOpBuilder &b, Operation *op, - Value pdlV) { - func::FuncOp funcOp = op->getParentOfType(); - linalg::LinalgOp linalgOp = cast(op); - Operation *user = *linalgOp->getUsers().begin(); - bool hasEpilogue = isa(user); - - if (hasEpilogue) { - setMarker(user, getEpilogueMarker()); - } - - bool isBMM = linalgOp.getNumParallelLoops() == 3; - - SmallVector tileSizeConfig = getGemmTileSize(funcOp).value(); - - auto func = b.create( - pdlV.getType(), pdlV, - /* isolated_from_above */ false, - /* allow_empty_results */ false, - /* op_name */ b.getStringAttr(func::FuncOp::getOperationName()), - /* deduplicate */ false, - /* nth_parent */ 1); - - auto anyType = transform::AnyOpType::get(b.getContext()); - auto linalgFillType = transform::OperationType::get( - b.getContext(), linalg::FillOp::getOperationName()); - auto linalgFill = b.create( - linalgFillType, func, linalg::FillOp::getOperationName()); - - Value mmaLevel = b.create( - /* type */ pdl::AttributeType::get(b.getContext()), - /* value */ b.getStringAttr("Threadblock")); - Value target = b.create( - /* type */ pdl::AttributeType::get(b.getContext()), - /* value */ b.getStringAttr("nv_sm_80")); - - SmallVector mappingIdx; - if (isBMM) { - mappingIdx = {2, 1, 0}; - } else { - mappingIdx = {1, 0}; - } - auto mapping = llvm::to_vector(llvm::map_range( - mappingIdx, [](int64_t i) { return static_cast(i); })); - auto mappingAttrs = llvm::to_vector( - llvm::map_range(mapping, [&](gpu::MappingId dim) -> Attribute { - return gpu::GPUBlockMappingAttr::get(b.getContext(), dim); - })); - - SmallVector parrallelTileSizes; - if (isBMM) { - parrallelTileSizes = {1, tileSizeConfig[0], tileSizeConfig[1]}; - } else { - parrallelTileSizes = {tileSizeConfig[0], tileSizeConfig[1]}; - } - Value tiledMatmulOp; - if (hasEpilogue) { - auto linalgGenericType = transform::OperationType::get( - b.getContext(), linalg::GenericOp::getOperationName()); - auto epilogue = b.create( - linalgGenericType, func, - b.getStrArrayAttr({linalg::GenericOp::getOperationName()}), - /*matchInterfaceEnum=*/transform::MatchInterfaceEnumAttr(), - /*opAttrs=*/ - b.getDictionaryAttr({NamedAttribute( - b.getStringAttr(getEpilogueMarker()), b.getUnitAttr())}), - /*filterResultType=*/TypeAttr(), - /*filterOperandTYpes=*/ArrayAttr()); - - transform::TileUsingForallOp tileOp = - b.create( - /* target */ epilogue, - /* staticTileSizes */ parrallelTileSizes, - /* ctor tag */ transform::TileSizesSpec(), - /* mapping */ b.getArrayAttr(mappingAttrs)); - transform::FuseIntoContainingOp fuse = - b.create( - /* producerOp */ pdlV, - /* containingOp */ tileOp.getForallOp()); - b.create( - /* producerOp */ linalgFill, - /* containingOp */ fuse.getNewContainingOp()); - tiledMatmulOp = fuse.getFusedOp(); - } else { - transform::TileUsingForallOp tileOp = - b.create( - /* target */ pdlV, - /* staticTileSizes */ parrallelTileSizes, - /* ctor tag */ transform::TileSizesSpec(), - /* mapping */ b.getArrayAttr(mappingAttrs)); - - b.create( - 
/* producerOp */ linalgFill, - /* containingOp */ tileOp.getForallOp()); - tiledMatmulOp = tileOp.getTiledOp(); - } - - SmallVector reductionTileSizes; - if (isBMM) - reductionTileSizes = {0, 0, 0, tileSizeConfig[2]}; - else - reductionTileSizes = {0, 0, tileSizeConfig[2]}; - auto tileKMatmulOp = - b.create(tiledMatmulOp, reductionTileSizes); - auto matmulKOp = tileKMatmulOp.getTiledLinalgOp(); - auto forLoops = tileKMatmulOp.getLoops(); - if (!forLoops.empty()) { - b.create(forLoops[0], getMatmulMainLoopMarker(), - Value()); - } else { - b.create(matmulKOp, getMatmulMainLoopMarker(), - Value()); - } - - b.create(matmulKOp, getLinalgMMALevelAttrName(), - mmaLevel); - b.create(matmulKOp, getLinalgTargetAttrName(), - target); - b.create(matmulKOp, getMMAPatternAttrName(), - Value()); - }; - - pm.addPass(createGenericTransformInsertionPass(config)); -} - -} // namespace - -void mlir::createGPUTileGemmTransform(OpPassManager &pm, - const GPUGemmGeneralOptions &options) { - invokeOpPassPipelineBuilder(createGPUTileGemmTransformImpl, pm, - options.funcAnchor, options.annotatePrefix); -} - -namespace { - -void createGPUAddGemmCodegenLoweringConfigTransformImpl( - OpPassManager &pm, const std::string &anchor, const std::string &prefix, - ArrayRef tileSizeConfig, ArrayRef workgroupSize, - int64_t stages) { - - SmallVector tileSizeConfigVec{tileSizeConfig}; - SmallVector workgroupSizeVec{workgroupSize}; - - TransformInsertionConfig config; - config.funcAnchor = anchor; - config.matchPrefix = prefix; - - config.opFilter = [=](Operation *op) { - if (isLinalgOpMatmul(op)) { - // TODO: check if the matmul op is already annotated - // TODO: Add different lowering config for different matmul op size - return true; - } - return false; - }; - - config.transformBuilder = [=](ImplicitLocOpBuilder &b, Operation *op, - Value pdlV) { - // auto linalgOp = llvm::cast(op); - auto tileSizeConfigAttrs = b.getAttr(llvm::to_vector( - llvm::map_range(tileSizeConfigVec, [&](int64_t i) -> Attribute { - return b.getI64IntegerAttr(i); - }))); - auto workgroupSizeAttrs = b.getAttr(llvm::to_vector( - llvm::map_range(workgroupSizeVec, [&](int64_t i) -> Attribute { - return b.getI64IntegerAttr(i); - }))); - auto stagesAttr = b.getI64IntegerAttr(stages); - - auto func = b.create( - pdlV.getType(), pdlV, - /* isolated_from_above */ true, - /* allow_empty_results */ false, - /* op_name */ b.getStringAttr(func::FuncOp::getOperationName()), - /* deduplicate */ false, - /* nth_parent */ 1); - - Value tileSizeConfigValue = b.create( - /* type */ pdl::AttributeType::get(b.getContext()), - /* value */ tileSizeConfigAttrs); - Value workgroupSizeValue = b.create( - /* type */ pdl::AttributeType::get(b.getContext()), - /* value */ workgroupSizeAttrs); - Value stagesValue = b.create( - /* type */ pdl::AttributeType::get(b.getContext()), - /* value */ stagesAttr); - - b.create(func, getGemmTileConfigAttrName(), - tileSizeConfigValue); - b.create(func, getGemmBlockSizeAttrName(), - workgroupSizeValue); - b.create(func, getGemmPipelineDepthAttrName(), - stagesValue); - }; - pm.addPass(createGenericTransformInsertionPass(config)); -} -} // namespace - -void mlir::createGPUAddGemmCodegenLoweringConfigTransform( - OpPassManager &pm, const GPUGemmCodegenConfigOptions &options) { - invokeOpPassPipelineBuilder( - createGPUAddGemmCodegenLoweringConfigTransformImpl, pm, - options.funcAnchor, options.annotatePrefix, options.tileSizeConfig, - options.workgroupSize, options.stages); -} - -} \ No newline at end of file diff --git 
a/midend/lib/Dialect/Transform/Transforms/PassDetail.h b/midend/lib/Dialect/Transform/Transforms/PassDetail.h new file mode 100644 index 0000000000..0c4840f1d6 --- /dev/null +++ b/midend/lib/Dialect/Transform/Transforms/PassDetail.h @@ -0,0 +1,13 @@ +#ifndef TRANSFORM_PASSDETAIL_H +#define TRANSFORM_PASSDETAIL_H + +#include "mlir/Pass/Pass.h" + +namespace mlir { + +#define GEN_PASS_CLASSES +#include "Transform/Passes.h.inc" + +} // namespace mlir + +#endif \ No newline at end of file diff --git a/midend/lib/Dialect/Transform/Transforms/TransformDialectInterpreter.cpp b/midend/lib/Dialect/Transform/Transforms/TransformDialectInterpreter.cpp index 24f81e5a89..2890724998 100644 --- a/midend/lib/Dialect/Transform/Transforms/TransformDialectInterpreter.cpp +++ b/midend/lib/Dialect/Transform/Transforms/TransformDialectInterpreter.cpp @@ -1,15 +1,17 @@ -#define GEN_PASS_DEF_TRANSFORMDIALECTINTERPRETER -#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h" -#include "Transform/Passes.h" +#include "Transform/Transforms/TransformDialectInterpreter.h" + +#include "mlir/Dialect/Transform/IR/TransformInterfaces.h" #include "mlir/IR/BuiltinOps.h" +#include "PassDetail.h" + using namespace mlir; namespace { struct TransformDialectInterpreterPass - : public impl::TransformDialectInterpreterBase { - + : public TransformDialectInterpreterBase { + explicit TransformDialectInterpreterPass(bool erase) : TransformDialectInterpreterBase() { eraseAfter = erase; @@ -40,6 +42,6 @@ struct TransformDialectInterpreterPass } // namespace std::unique_ptr> -createTransformDialectInterpreter(bool eraseAfter = false) { +mlir::createTransformDialectInterpreter(bool eraseAfter) { return std::make_unique(eraseAfter); } \ No newline at end of file diff --git a/midend/lib/Dialect/Transform/Transforms/TransformInsertion.cpp b/midend/lib/Dialect/Transform/Transforms/TransformInsertion.cpp index 6595b8347c..1e936d8f9b 100644 --- a/midend/lib/Dialect/Transform/Transforms/TransformInsertion.cpp +++ b/midend/lib/Dialect/Transform/Transforms/TransformInsertion.cpp @@ -1,7 +1,7 @@ -#include "Transform/Passes.h" +#include "Transform/Transforms/TransformInsertion.h" -#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h" #include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/PDL/IR/PDLOps.h" #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/Linalg/TransformOps/DialectExtension.h" #include "mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h" @@ -16,6 +16,8 @@ #include "mlir/IR/MLIRContext.h" #include "mlir/IR/Operation.h" +#include "PassDetail.h" + #include #include @@ -24,6 +26,12 @@ using namespace llvm; namespace { +inline std::string getAnnotationUniqueIdentifier(const std::string matchPrefix) +{ + static size_t cnt = 0; + return matchPrefix + "_" + std::to_string(cnt++); +} + void insertTransformIR(func::FuncOp funcOp, OpBuilder &builder, const TransformInsertionConfig &config) { funcOp->walk([&](Operation *op) { @@ -42,8 +50,8 @@ void insertTransformIR(func::FuncOp funcOp, OpBuilder &builder, ctx, b.getNamedAttr(annotation, UnitAttr::get(ctx))); auto match = b.create( loc, blockArg.getType(), blockArg, ArrayAttr(), - transform::MatchInterfaceEnumAttr(), annotationAttr, TypeAttr(), - ArrayAttr()); + transform::MatchInterfaceEnumAttr(), annotationAttr, TypeAttr() + /*ArrayAttr()*/); ImplicitLocOpBuilder ib(loc, b); config.transformBuilder(ib, op, match); b.create(loc); From ef66238ef0282e3a88f76a9f8e78551539a6b1e6 Mon Sep 17 00:00:00 2001 From: CJ77Qi Date: Wed, 21 Aug 2024 
11:24:40 +0000 Subject: [PATCH 3/7] fix Transform Transform Pass --- midend/include/Dialect/Transform/Passes.td | 2 +- .../Dialect/Transform/Transforms/TransformInsertion.h | 2 +- midend/lib/Dialect/Transform/Transforms/CMakeLists.txt | 3 --- .../Transform/Transforms/TransformDialectInterpreter.cpp | 1 - .../Dialect/Transform/Transforms/TransformInsertion.cpp | 7 +++---- 5 files changed, 5 insertions(+), 10 deletions(-) diff --git a/midend/include/Dialect/Transform/Passes.td b/midend/include/Dialect/Transform/Passes.td index 104523d2e8..f3790312ce 100644 --- a/midend/include/Dialect/Transform/Passes.td +++ b/midend/include/Dialect/Transform/Passes.td @@ -1,7 +1,7 @@ include "mlir/Pass/PassBase.td" // TransformDialectInterpreter -def TransformDialectInterpreter : Pass<"transform-dialect-interpreter", "mlir::ModuleOp"> { +def TransformDialectInterpreter : Pass<"transform-dialect-interpreter", "ModuleOp"> { let summary = "Apply transform dialect operations one by one"; let constructor = "mlir::createTransformDialectInterpreter()"; let options = [ diff --git a/midend/include/Dialect/Transform/Transforms/TransformInsertion.h b/midend/include/Dialect/Transform/Transforms/TransformInsertion.h index 22ad91c09e..4d30289bcb 100644 --- a/midend/include/Dialect/Transform/Transforms/TransformInsertion.h +++ b/midend/include/Dialect/Transform/Transforms/TransformInsertion.h @@ -12,7 +12,7 @@ struct TransformInsertionConfig { std::string matchPrefix; std::function opFilter; std::function transformBuilder; -} +}; std::unique_ptr> createGenericTransformInsertionPass(const TransformInsertionConfig &config); diff --git a/midend/lib/Dialect/Transform/Transforms/CMakeLists.txt b/midend/lib/Dialect/Transform/Transforms/CMakeLists.txt index ac0fe3b197..9624c9f651 100644 --- a/midend/lib/Dialect/Transform/Transforms/CMakeLists.txt +++ b/midend/lib/Dialect/Transform/Transforms/CMakeLists.txt @@ -4,13 +4,10 @@ add_mlir_dialect_library(BuddyTransformPasses DEPENDS BuddyTransformPassIncGen -# MLIRLinalgExtTransformOps -# MLIRTransformExtOpsIncGen LINK_LIBS PUBLIC MLIRIR MLIRPass MLIRPDLDialect MLIRTransformDialect -# MLIRLinalgExtTransformOps ) diff --git a/midend/lib/Dialect/Transform/Transforms/TransformDialectInterpreter.cpp b/midend/lib/Dialect/Transform/Transforms/TransformDialectInterpreter.cpp index 2890724998..406f5f70f3 100644 --- a/midend/lib/Dialect/Transform/Transforms/TransformDialectInterpreter.cpp +++ b/midend/lib/Dialect/Transform/Transforms/TransformDialectInterpreter.cpp @@ -1,5 +1,4 @@ #include "Transform/Transforms/TransformDialectInterpreter.h" - #include "mlir/Dialect/Transform/IR/TransformInterfaces.h" #include "mlir/IR/BuiltinOps.h" diff --git a/midend/lib/Dialect/Transform/Transforms/TransformInsertion.cpp b/midend/lib/Dialect/Transform/Transforms/TransformInsertion.cpp index 1e936d8f9b..f54e6d9a09 100644 --- a/midend/lib/Dialect/Transform/Transforms/TransformInsertion.cpp +++ b/midend/lib/Dialect/Transform/Transforms/TransformInsertion.cpp @@ -1,5 +1,3 @@ -#include "Transform/Transforms/TransformInsertion.h" - #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/PDL/IR/PDLOps.h" #include "mlir/Dialect/Linalg/IR/Linalg.h" @@ -17,6 +15,7 @@ #include "mlir/IR/Operation.h" #include "PassDetail.h" +#include "Transform/Transforms/TransformInsertion.h" #include #include @@ -91,6 +90,6 @@ struct GenericTransformInsertionPass } // namespace std::unique_ptr> -mlir::createGenericTransformInsertionPass(const TransformInsertionConfig &config) { +createGenericTransformInsertionPass(const 
mlir::TransformInsertionConfig &config) { return std::make_unique(config); -} \ No newline at end of file +} From 50a55a96fc63ebbf81a546ce56daa5dddab39c2e Mon Sep 17 00:00:00 2001 From: CJ77Qi Date: Thu, 29 Aug 2024 10:06:32 +0800 Subject: [PATCH 4/7] fix bug --- CMakeLists.txt | 1 + examples/BuddyMatmul/linalg-matmul.mlir | 21 ++++ midend/include/CMakeLists.txt | 2 +- midend/include/Dialect/GPU/CMakeLists.txt | 0 midend/include/Dialect/GPU/Transforms/Utils.h | 22 +++++ .../Transforms/TransformDialectInterpreter.h | 7 +- .../Transform/Transforms/TransformInsertion.h | 7 +- .../Pipelines/GPU/GemmCodegenTransform.h | 36 +++++++ midend/include/Pipelines/GPU/Utils.h | 32 ++++++ midend/include/Pipelines/LinalgTensorOpt.h | 31 ++++++ midend/lib/CMakeLists.txt | 1 + midend/lib/Dialect/GPU/CMakeLists.txt | 0 .../Transforms/TransformInsertion.cpp | 11 ++- midend/lib/Pipelines/CMakeLists.txt | 10 ++ midend/lib/Pipelines/GPU/CMakeLists.txt | 11 +++ .../Pipelines/GPU/GemmCodegenTransform.cpp | 98 +++++++++++++++++++ midend/lib/Pipelines/GPU/Utils.cpp | 52 ++++++++++ midend/lib/Pipelines/LinalgTensorOpt.cpp | 48 +++++++++ tools/buddy-opt/CMakeLists.txt | 3 + tools/buddy-opt/buddy-opt.cpp | 5 + 20 files changed, 392 insertions(+), 6 deletions(-) create mode 100644 examples/BuddyMatmul/linalg-matmul.mlir create mode 100644 midend/include/Dialect/GPU/CMakeLists.txt create mode 100644 midend/include/Dialect/GPU/Transforms/Utils.h create mode 100644 midend/include/Pipelines/GPU/GemmCodegenTransform.h create mode 100644 midend/include/Pipelines/GPU/Utils.h create mode 100644 midend/include/Pipelines/LinalgTensorOpt.h create mode 100644 midend/lib/Dialect/GPU/CMakeLists.txt create mode 100644 midend/lib/Pipelines/CMakeLists.txt create mode 100644 midend/lib/Pipelines/GPU/CMakeLists.txt create mode 100644 midend/lib/Pipelines/GPU/GemmCodegenTransform.cpp create mode 100644 midend/lib/Pipelines/GPU/Utils.cpp create mode 100644 midend/lib/Pipelines/LinalgTensorOpt.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 4863467449..0f440738c6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,6 +22,7 @@ project(buddy-mlir LANGUAGES CXX C) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED YES) +set(CMAKE_BUILD_TYPE Debug) include(ExternalProject) #------------------------------------------------------------------------------- diff --git a/examples/BuddyMatmul/linalg-matmul.mlir b/examples/BuddyMatmul/linalg-matmul.mlir new file mode 100644 index 0000000000..5fbfd77013 --- /dev/null +++ b/examples/BuddyMatmul/linalg-matmul.mlir @@ -0,0 +1,21 @@ +#map = affine_map<(d0, d1, d2) -> (d0, d2)> +#map1 = affine_map<(d0, d1, d2) -> (d2, d1)> +#map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +module attributes {} { + func.func private @Unknown0(%arg0: tensor<1280x32xf16>, %arg1: tensor<32x1280xf16>) -> tensor<1280x1280xf16> attributes {} { + %cst = arith.constant 0.000000e+00 : f16 + %0 = tensor.empty() : tensor<1280x1280xf16> + %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<1280x1280xf16>) -> tensor<1280x1280xf16> + %2 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<1280x32xf16>, tensor<32x1280xf16>) outs(%1 : tensor<1280x1280xf16>) { + ^bb0(%in: f16, %in_0: f16, %out: f16): + %3 = arith.mulf %in, %in_0 : f16 + %4 = arith.addf %out, %3 : f16 + linalg.yield %4 : f16 + } -> tensor<1280x1280xf16> + return %2 : tensor<1280x1280xf16> + } + func.func @forward(%arg0: tensor<1280x32xf16>, %arg1: tensor<32x1280xf16>) -> 
tensor<1280x1280xf16> { + %0 = call @Unknown0(%arg0, %arg1) : (tensor<1280x32xf16>, tensor<32x1280xf16>) -> tensor<1280x1280xf16> + return %0 : tensor<1280x1280xf16> + } +} diff --git a/midend/include/CMakeLists.txt b/midend/include/CMakeLists.txt index 0ca0f41c5a..e6f347c8c1 100644 --- a/midend/include/CMakeLists.txt +++ b/midend/include/CMakeLists.txt @@ -1 +1 @@ -add_subdirectory(Dialect) +add_subdirectory(Dialect) \ No newline at end of file diff --git a/midend/include/Dialect/GPU/CMakeLists.txt b/midend/include/Dialect/GPU/CMakeLists.txt new file mode 100644 index 0000000000..e69de29bb2 diff --git a/midend/include/Dialect/GPU/Transforms/Utils.h b/midend/include/Dialect/GPU/Transforms/Utils.h new file mode 100644 index 0000000000..5824dfb5b3 --- /dev/null +++ b/midend/include/Dialect/GPU/Transforms/Utils.h @@ -0,0 +1,22 @@ +#ifndef GPU_TRANSFORMS_UTILS_H +#define GPU_TRANSFORMS_UTILS_H + +#include "mlir/Dialect/Linalg/Utils/Utils.h" + +namespace mlir { + +static constexpr StringRef getGemmTileConfigAttrName() { + return "__buddy_gemm_tile_config__"; +} + +static constexpr StringRef getGemmBlockSizeAttrName() { + return "__buddy_gemm_block_size__"; +} + +static constexpr StringRef getGemmPipelineStageAttrName() { + return "__buddy_gemm_pipeline_stage__"; +} + +} + +#endif \ No newline at end of file diff --git a/midend/include/Dialect/Transform/Transforms/TransformDialectInterpreter.h b/midend/include/Dialect/Transform/Transforms/TransformDialectInterpreter.h index fcd54ba8c1..d457a1bbe7 100644 --- a/midend/include/Dialect/Transform/Transforms/TransformDialectInterpreter.h +++ b/midend/include/Dialect/Transform/Transforms/TransformDialectInterpreter.h @@ -1,3 +1,6 @@ +#ifndef TRANSFORM_TRANSFORMS_TRANSFORMDIALECTINTERPRETER_H +#define TRANSFORM_TRANSFORMS_TRANSFORMDIALECTINTERPRETER_H + #include "mlir/Pass/Pass.h" #include @@ -8,4 +11,6 @@ class ModuleOp; std::unique_ptr> createTransformDialectInterpreter(bool eraseAfter = false); -} // namespace mlir \ No newline at end of file +} // namespace mlir + +#endif \ No newline at end of file diff --git a/midend/include/Dialect/Transform/Transforms/TransformInsertion.h b/midend/include/Dialect/Transform/Transforms/TransformInsertion.h index 4d30289bcb..099bea6a38 100644 --- a/midend/include/Dialect/Transform/Transforms/TransformInsertion.h +++ b/midend/include/Dialect/Transform/Transforms/TransformInsertion.h @@ -1,3 +1,6 @@ +#ifndef TRANSFORM_TRANSFORMS_TRANSFORMINSERTION_H +#define TRANSFORM_TRANSFORMS_TRANSFORMINSERTION_H + #include "mlir/Pass/Pass.h" #include #include @@ -17,4 +20,6 @@ struct TransformInsertionConfig { std::unique_ptr> createGenericTransformInsertionPass(const TransformInsertionConfig &config); -} \ No newline at end of file +} //namespace mlir + +#endif \ No newline at end of file diff --git a/midend/include/Pipelines/GPU/GemmCodegenTransform.h b/midend/include/Pipelines/GPU/GemmCodegenTransform.h new file mode 100644 index 0000000000..6bec6ec46e --- /dev/null +++ b/midend/include/Pipelines/GPU/GemmCodegenTransform.h @@ -0,0 +1,36 @@ +#ifndef PIPELINES_GPU_GEMM_CODEGEN_TRANSOFRM_H +#define PIPELINES_GPU_GEMM_CODEGEN_TRANSOFRM_H + +#include "mlir/Pass/PassManager.h" +#include "mlir/Pass/PassOptions.h" +#include "mlir/Pass/PassRegistry.h" + +namespace mlir { +namespace buddy { +struct GPUGemmCodegenConfigOptions : public PassPipelineOptions { + Option funcAnchor { + *this, "func-anchor", + llvm::cl::desc( + "An optional Unit attribute anchoring on target functions."), + llvm::cl::init("")}; + Option annotatePrefix { + 
*this, "annotate-prefix", + llvm::cl::desc("An optional annotate prefix attribute on target ops."), + llvm::cl::init("__buddy_gpu_gemm__")}; + ListOption tileConfig { + *this, "tile-config", + llvm::cl::desc("An optional tile config for matmul op")}; + ListOption workGroup { + *this, "work-group", + llvm::cl::desc("An optional workgroup size config for matmul op")}; + Option stages { + *this, "stages", + llvm::cl::desc("An optional stages config for matmul op")}; +}; + +void createGPUGemmTileConfigInsertTransform(OpPassManager &pm, const GPUGemmCodegenConfigOptions &options); + +} // namespace mlir::buddy +} // namespace mlir + +#endif \ No newline at end of file diff --git a/midend/include/Pipelines/GPU/Utils.h b/midend/include/Pipelines/GPU/Utils.h new file mode 100644 index 0000000000..857b796618 --- /dev/null +++ b/midend/include/Pipelines/GPU/Utils.h @@ -0,0 +1,32 @@ +#ifndef PIPELINES_GPU_UTILS_H +#define PIPELINES_GPU_UTILS_H + +#include "mlir/Pass/PassManager.h" +#include "mlir/Pass/PassRegistry.h" + +namespace mlir { +class ModuleOp; +namespace buddy { +template +void invokeOpPassPipelineBuilder(Builder builder, OpPassManager &pm, + Args &&...args) { + if (pm.getOpAnchorName() != OpPassManager::getAnyOpAnchorName() && + pm.getOpAnchorName() != OpClass::getOperationName()) { + if (pm.getNesting() == OpPassManager::Nesting::Implicit) { + builder(pm.nest(), std::forward(args)...); + return; + } + llvm::report_fatal_error( + llvm::Twine("Can't build pass pipeline on expected op type ") + + OpClass::getOperationName() + " but got " + pm.getOpAnchorName()); + } else { + builder(pm, std::forward(args)...); + } +} + +bool isLinalgMatmul(Operation *op); + +} // namespace buddy::pipelines +} // namespace buddy + +#endif \ No newline at end of file diff --git a/midend/include/Pipelines/LinalgTensorOpt.h b/midend/include/Pipelines/LinalgTensorOpt.h new file mode 100644 index 0000000000..d379f81cb3 --- /dev/null +++ b/midend/include/Pipelines/LinalgTensorOpt.h @@ -0,0 +1,31 @@ +#ifndef PIPELINES_GPU_LINALGTENSOROPT_H +#define PIPELINES_GPU_LINALGTENSOROPT_H + +#include "mlir/Pass/Pass.h" +#include "mlir/Pass/PassManager.h" +#include +#include + +namespace mlir { +namespace buddy { + +struct LinalgTensorOptPipelineOptions + : public PassPipelineOptions { + Option target{ + *this, "target", + llvm::cl::desc("An optional attribute to speicify target."), + llvm::cl::init("gpu")}; + Option arch{ + *this, "arch", llvm::cl::desc("An optional attribute to speicify arch."), + llvm::cl::init("nv_sm_80")}; +}; + +void createLinalgTensorOptPassPipeline(OpPassManager &pm, const LinalgTensorOptPipelineOptions &options); + +void registerLinalgTensorOptPassPipeline(); + + +} // namespace mlir::buddy +} // namespace mlir + +#endif \ No newline at end of file diff --git a/midend/lib/CMakeLists.txt b/midend/lib/CMakeLists.txt index 19b254cf38..865af316e4 100644 --- a/midend/lib/CMakeLists.txt +++ b/midend/lib/CMakeLists.txt @@ -2,6 +2,7 @@ add_subdirectory(Dialect) add_subdirectory(Conversion) add_subdirectory(Target) add_subdirectory(Utils) +add_subdirectory(Pipelines) # Build static library for async runtime. 
add_mlir_library(static_mlir_async_runtime diff --git a/midend/lib/Dialect/GPU/CMakeLists.txt b/midend/lib/Dialect/GPU/CMakeLists.txt new file mode 100644 index 0000000000..e69de29bb2 diff --git a/midend/lib/Dialect/Transform/Transforms/TransformInsertion.cpp b/midend/lib/Dialect/Transform/Transforms/TransformInsertion.cpp index f54e6d9a09..7018174fb8 100644 --- a/midend/lib/Dialect/Transform/Transforms/TransformInsertion.cpp +++ b/midend/lib/Dialect/Transform/Transforms/TransformInsertion.cpp @@ -28,13 +28,14 @@ namespace { inline std::string getAnnotationUniqueIdentifier(const std::string matchPrefix) { static size_t cnt = 0; - return matchPrefix + "_" + std::to_string(cnt++); + return matchPrefix + std::to_string(cnt++); } void insertTransformIR(func::FuncOp funcOp, OpBuilder &builder, const TransformInsertionConfig &config) { - funcOp->walk([&](Operation *op) { + funcOp->walk([&](Operation *op) { if (config.opFilter(op)) { + op->print(llvm::errs()); ImplicitLocOpBuilder b(op->getLoc(), builder); MLIRContext *ctx = b.getContext(); @@ -51,10 +52,14 @@ void insertTransformIR(func::FuncOp funcOp, OpBuilder &builder, loc, blockArg.getType(), blockArg, ArrayAttr(), transform::MatchInterfaceEnumAttr(), annotationAttr, TypeAttr() /*ArrayAttr()*/); + // debug + match->print(llvm::errs()); + llvm::errs() << "\n"; ImplicitLocOpBuilder ib(loc, b); config.transformBuilder(ib, op, match); b.create(loc); }); + funcOp->print(llvm::errs()); } }); } @@ -90,6 +95,6 @@ struct GenericTransformInsertionPass } // namespace std::unique_ptr> -createGenericTransformInsertionPass(const mlir::TransformInsertionConfig &config) { +mlir::createGenericTransformInsertionPass(const mlir::TransformInsertionConfig &config) { return std::make_unique(config); } diff --git a/midend/lib/Pipelines/CMakeLists.txt b/midend/lib/Pipelines/CMakeLists.txt new file mode 100644 index 0000000000..ed97f0e647 --- /dev/null +++ b/midend/lib/Pipelines/CMakeLists.txt @@ -0,0 +1,10 @@ +add_subdirectory(GPU) + +add_mlir_library(BuddyPipelines + LinalgTensorOpt.cpp + GPU/Utils.cpp + LINK_LIBS PUBLIC + MLIRIR + BuddyGPUPipelines + BuddyTransformPasses +) \ No newline at end of file diff --git a/midend/lib/Pipelines/GPU/CMakeLists.txt b/midend/lib/Pipelines/GPU/CMakeLists.txt new file mode 100644 index 0000000000..1f5a151312 --- /dev/null +++ b/midend/lib/Pipelines/GPU/CMakeLists.txt @@ -0,0 +1,11 @@ +add_mlir_library(BuddyGPUPipelines + GemmCodegenTransform.cpp + Utils.cpp + + LINK_LIBS PUBLIC + MLIRIR + MLIRPDLDialect + MLIRTransformDialect + MLIRTransforms + BuddyTransformPasses +) \ No newline at end of file diff --git a/midend/lib/Pipelines/GPU/GemmCodegenTransform.cpp b/midend/lib/Pipelines/GPU/GemmCodegenTransform.cpp new file mode 100644 index 0000000000..8bca10bfd8 --- /dev/null +++ b/midend/lib/Pipelines/GPU/GemmCodegenTransform.cpp @@ -0,0 +1,98 @@ +#include "Transform/Transforms/TransformInsertion.h" +#include "GPU/Transforms/Utils.h" +#include "Pipelines/GPU/GemmCodegenTransform.h" +#include "Pipelines/GPU/Utils.h" + +#include "mlir/Dialect/Transform/IR/TransformOps.h" +#include "mlir/Dialect/PDL/IR/PDLOps.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h" +#include "mlir/Dialect/MemRef/TransformOps/MemRefTransformOps.h" +#include "mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/IR/BuiltinOps.h" +#include 
"llvm/ADT/SmallSet.h" + +#include + +using namespace mlir; + +namespace { + +void createGPUAddGemmCodegenLoweringConfigTransformImpl( + OpPassManager &pm, const std::string &anchor, const std::string &prefix, + ArrayRef tileConfig, ArrayRef workGroup, int64_t stages) { + + SmallVector vecTileConfig{tileConfig}; + SmallVector vecWorkGroup{workGroup}; + + TransformInsertionConfig config; + config.funcAnchor = anchor; + + config.matchPrefix = prefix; + + config.opFilter = [=](Operation *op){ + if (mlir::buddy::isLinalgMatmul(op)) { + return true; + } + return false; + }; + + config.transformBuilder = [=](ImplicitLocOpBuilder &b, Operation *op, Value pdlV) { + auto tileConfigAttrs = b.getAttr(llvm::to_vector( + llvm::map_range(vecTileConfig, [&](int64_t i) -> Attribute { + return b.getI64IntegerAttr(i); + }))); + auto workgroupAttrs = b.getAttr(llvm::to_vector( + llvm::map_range(vecWorkGroup, [&](int64_t i) -> Attribute { + return b.getI64IntegerAttr(i); + }))); + auto stagesAttr = b.getI64IntegerAttr(stages); + + auto func = b.create( + pdlV.getType(), pdlV, + /* isolated_from_above */ true, + /* allow_empty_results */ false, + /* op_name */ b.getStringAttr(func::FuncOp::getOperationName()), + /* deduplicate */ false, + /* nth_parent */ 1); + + Value tileConfigValue = b.create( + /* type */ pdl::AttributeType::get(b.getContext()), + /* value */ tileConfigAttrs + ); + + llvm::errs() << tileConfigValue << "\n"; + + Value workGroupValue = b.create( + /* type */ pdl::AttributeType::get(b.getContext()), + /* value */ workgroupAttrs + ); + + Value stagesValue = b.create( + /* type */ pdl::AttributeType::get(b.getContext()), + /* value */ stagesAttr + ); + + b.create(func, getGemmTileConfigAttrName(), + tileConfigValue); + b.create(func, getGemmBlockSizeAttrName(), + workGroupValue); + b.create(func, getGemmPipelineStageAttrName(), + stagesValue); + }; + + pm.addPass(createGenericTransformInsertionPass(config)); +} + +} // namespace + +void mlir::buddy::createGPUGemmTileConfigInsertTransform( + OpPassManager &pm, const GPUGemmCodegenConfigOptions &options) { + mlir::buddy::invokeOpPassPipelineBuilder( + createGPUAddGemmCodegenLoweringConfigTransformImpl, pm, + options.funcAnchor, options.annotatePrefix, options.tileConfig, + options.workGroup, options.stages); +} \ No newline at end of file diff --git a/midend/lib/Pipelines/GPU/Utils.cpp b/midend/lib/Pipelines/GPU/Utils.cpp new file mode 100644 index 0000000000..7298195995 --- /dev/null +++ b/midend/lib/Pipelines/GPU/Utils.cpp @@ -0,0 +1,52 @@ +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/Linalg/Utils/Utils.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" + +#include "Pipelines/GPU/Utils.h" + +using namespace mlir; +using namespace llvm; + +namespace mlir { +namespace buddy { + +bool isLinalgMatmul(Operation *op) { + if (!llvm::isa(op)) { + return false; + } + + linalg::LinalgOp linalgOp = cast(op); + if (isa(linalgOp) || isa(linalgOp)) { + return true; + } else { + if (!(linalg::isaContractionOpInterface(linalgOp) && + linalgOp.getNumParallelLoops() >= 2 && + linalgOp.getNumParallelLoops() <= 3)) { + return false; + } + Region &body = linalgOp->getRegion(0); + Region::OpIterator it = body.op_begin(); + while (it != body.op_end() && isa(*it)) + it++; + if (it == body.op_end() || !isa(*(it++))) + return false; + if (it == body.op_end() || !isa(*(it++))) + return false; + if (it == body.op_end() || !isa(*(it++))) + return false; + 
AffineMap outputMap = linalgOp.getMatchingIndexingMap(linalgOp.getDpsInitOperand(0)); + if (outputMap.getNumResults() != outputMap.getNumDims() - 1) + return false; + OpBuilder b(linalgOp); + for (unsigned i = 0; i < outputMap.getNumResults(); i++) { + if (outputMap.getResult(i) != b.getAffineDimExpr(i)) + return false; + } + return true; + } +} + +} // namespace mlir::buddy +} // namespace mlir \ No newline at end of file diff --git a/midend/lib/Pipelines/LinalgTensorOpt.cpp b/midend/lib/Pipelines/LinalgTensorOpt.cpp new file mode 100644 index 0000000000..a0bce01fd3 --- /dev/null +++ b/midend/lib/Pipelines/LinalgTensorOpt.cpp @@ -0,0 +1,48 @@ +#include "Pipelines/LinalgTensorOpt.h" +#include "Pipelines/GPU/Utils.h" +#include "Pipelines/GPU/GemmCodegenTransform.h" + +#include "Transform/Transforms/TransformDialectInterpreter.h" +#include "Transform/Transforms/TransformInsertion.h" + +#include "mlir/Dialect/Linalg/Passes.h" + +using namespace mlir; + +namespace { + +void addGPULinalgOptPasses(OpPassManager &pm) { + { // Gemm Codegen Linalg Tensor Opt + SmallVector tileConfig = {32, 32, 16}; + SmallVector workGroup = {32, 2, 1}; + int64_t stages = 3; + mlir::buddy::GPUGemmCodegenConfigOptions configOption; + configOption.tileConfig = tileConfig; + configOption.workGroup = workGroup; + configOption.stages = stages; + createGPUGemmTileConfigInsertTransform(pm, configOption); + pm.addPass(createTransformDialectInterpreter(true)); + } +} + +void createLinalgTensorOptPassPipelineimpl(OpPassManager &pm, + const std::string &target, + const std::string &arch) { + if (target == "gpu") { + addGPULinalgOptPasses(pm); + } else { + /*TODO*/ + } +} + +} // namespace + +void mlir::buddy::createLinalgTensorOptPassPipeline(OpPassManager &pm, + const LinalgTensorOptPipelineOptions &options) { + mlir::buddy::invokeOpPassPipelineBuilder(createLinalgTensorOptPassPipelineimpl, pm, options.target, options.arch); +} + +void mlir::buddy::registerLinalgTensorOptPassPipeline() { + PassPipelineRegistration( + "linalg-tensor-opt", "Linalg with Tensor Opt Pass Pipeline", mlir::buddy::createLinalgTensorOptPassPipeline); +} diff --git a/tools/buddy-opt/CMakeLists.txt b/tools/buddy-opt/CMakeLists.txt index 24bcde9359..7a87da7f24 100644 --- a/tools/buddy-opt/CMakeLists.txt +++ b/tools/buddy-opt/CMakeLists.txt @@ -37,4 +37,7 @@ target_link_libraries(buddy-opt SchedulingOnDevices LowerSche FuncBufferizeDynamicOffset + BuddyPipelines + BuddyGPUPipelines + BuddyTransformPasses ) diff --git a/tools/buddy-opt/buddy-opt.cpp b/tools/buddy-opt/buddy-opt.cpp index bea9513b5e..fc96aa8bae 100644 --- a/tools/buddy-opt/buddy-opt.cpp +++ b/tools/buddy-opt/buddy-opt.cpp @@ -47,6 +47,7 @@ #include "Gemmini/GemminiOps.h" #include "Sche/ScheDialect.h" #include "Sche/ScheOps.h" +#include "Pipelines/LinalgTensorOpt.h" namespace mlir { namespace buddy { @@ -71,6 +72,7 @@ void registerLowerLinalgToGemminiPass(); void registerDeviceSchedulePass(); void registerLowerSchePass(); void registerFuncBufferizeDynamicOffsetPass(); +void registerLinalgTensorOptPassPipeline(); } // namespace buddy } // namespace mlir @@ -104,6 +106,9 @@ int main(int argc, char **argv) { mlir::buddy::registerLowerSchePass(); mlir::buddy::registerFuncBufferizeDynamicOffsetPass(); + // Register Pipeline Passes + mlir::buddy::registerLinalgTensorOptPassPipeline(); + mlir::DialectRegistry registry; // Register all MLIR core dialects. 
registerAllDialects(registry); From b9e87d3ddc3d7795c51fcd236b74b17568c6fd7d Mon Sep 17 00:00:00 2001 From: CJ77Qi Date: Sat, 7 Sep 2024 14:30:39 +0800 Subject: [PATCH 5/7] add tiling pass of GPU GEMM Codegen --- examples/BuddyMatmul/linalg-matmul.mlir | 4 +- midend/include/Dialect/GPU/CMakeLists.txt | 0 midend/include/Dialect/GPU/Transforms/Utils.h | 22 -- .../Pipelines/GPU/GemmCodegenTransform.h | 19 +- midend/include/Pipelines/GPU/Utils.h | 57 +++++ .../Transforms/TransformInsertion.cpp | 6 +- .../Pipelines/GPU/GemmCodegenTransform.cpp | 213 ++++++++++++++++-- midend/lib/Pipelines/GPU/Utils.cpp | 57 +++++ midend/lib/Pipelines/LinalgTensorOpt.cpp | 16 +- 9 files changed, 336 insertions(+), 58 deletions(-) delete mode 100644 midend/include/Dialect/GPU/CMakeLists.txt delete mode 100644 midend/include/Dialect/GPU/Transforms/Utils.h diff --git a/examples/BuddyMatmul/linalg-matmul.mlir b/examples/BuddyMatmul/linalg-matmul.mlir index 5fbfd77013..72a70d48aa 100644 --- a/examples/BuddyMatmul/linalg-matmul.mlir +++ b/examples/BuddyMatmul/linalg-matmul.mlir @@ -2,7 +2,7 @@ #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> module attributes {} { - func.func private @Unknown0(%arg0: tensor<1280x32xf16>, %arg1: tensor<32x1280xf16>) -> tensor<1280x1280xf16> attributes {} { + func.func private @Matmul(%arg0: tensor<1280x32xf16>, %arg1: tensor<32x1280xf16>) -> tensor<1280x1280xf16> attributes {} { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1280x1280xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<1280x1280xf16>) -> tensor<1280x1280xf16> @@ -15,7 +15,7 @@ module attributes {} { return %2 : tensor<1280x1280xf16> } func.func @forward(%arg0: tensor<1280x32xf16>, %arg1: tensor<32x1280xf16>) -> tensor<1280x1280xf16> { - %0 = call @Unknown0(%arg0, %arg1) : (tensor<1280x32xf16>, tensor<32x1280xf16>) -> tensor<1280x1280xf16> + %0 = call @Matmul(%arg0, %arg1) : (tensor<1280x32xf16>, tensor<32x1280xf16>) -> tensor<1280x1280xf16> return %0 : tensor<1280x1280xf16> } } diff --git a/midend/include/Dialect/GPU/CMakeLists.txt b/midend/include/Dialect/GPU/CMakeLists.txt deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/midend/include/Dialect/GPU/Transforms/Utils.h b/midend/include/Dialect/GPU/Transforms/Utils.h deleted file mode 100644 index 5824dfb5b3..0000000000 --- a/midend/include/Dialect/GPU/Transforms/Utils.h +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef GPU_TRANSFORMS_UTILS_H -#define GPU_TRANSFORMS_UTILS_H - -#include "mlir/Dialect/Linalg/Utils/Utils.h" - -namespace mlir { - -static constexpr StringRef getGemmTileConfigAttrName() { - return "__buddy_gemm_tile_config__"; -} - -static constexpr StringRef getGemmBlockSizeAttrName() { - return "__buddy_gemm_block_size__"; -} - -static constexpr StringRef getGemmPipelineStageAttrName() { - return "__buddy_gemm_pipeline_stage__"; -} - -} - -#endif \ No newline at end of file diff --git a/midend/include/Pipelines/GPU/GemmCodegenTransform.h b/midend/include/Pipelines/GPU/GemmCodegenTransform.h index 6bec6ec46e..5bb4f9432f 100644 --- a/midend/include/Pipelines/GPU/GemmCodegenTransform.h +++ b/midend/include/Pipelines/GPU/GemmCodegenTransform.h @@ -28,7 +28,24 @@ struct GPUGemmCodegenConfigOptions : public PassPipelineOptions { + Option funcAnchor{ + *this, "func-anchor", + llvm::cl::desc( + "An optional Unit attribute anchoring on target functions."), + llvm::cl::init("")}; + Option annotatePrefix { + *this, "annotate-prefix", + llvm::cl::desc("An optional annotate prefix attribute on 
target ops."), + llvm::cl::init("__buddy_gpu_gemm__")}; +}; + + +void createGemmTileConfigInsertTransform(OpPassManager &pm, const GPUGemmCodegenConfigOptions &options); + +void createGemmTileTransform(OpPassManager &pm, + const GPUGemmGeneralOptions &options); } // namespace mlir::buddy } // namespace mlir diff --git a/midend/include/Pipelines/GPU/Utils.h b/midend/include/Pipelines/GPU/Utils.h index 857b796618..7821b53c31 100644 --- a/midend/include/Pipelines/GPU/Utils.h +++ b/midend/include/Pipelines/GPU/Utils.h @@ -6,7 +6,13 @@ namespace mlir { class ModuleOp; + +namespace func { +class FuncOp; +} // namespace mlir::func + namespace buddy { + template void invokeOpPassPipelineBuilder(Builder builder, OpPassManager &pm, Args &&...args) { @@ -26,6 +32,57 @@ void invokeOpPassPipelineBuilder(Builder builder, OpPassManager &pm, bool isLinalgMatmul(Operation *op); + +static constexpr StringRef getGemmTileMConfigAttrName() { + return "__buddy_gemm_tile_config__M"; +} + +static constexpr StringRef getGemmTileNConfigAttrName() { + return "__buddy_gemm_tile_config__N"; +} + +static constexpr StringRef getGemmTileKConfigAttrName() { + return "__buddy_gemm_tile_config__K"; +} + +static constexpr StringRef getGemmBlockXSizeAttrName() { + return "__buddy_gemm_block_size__X"; +} + +static constexpr StringRef getGemmBlockYSizeAttrName() { + return "__buddy_gemm_block_size__Y"; +} + +static constexpr StringRef getGemmBlockZSizeAttrName() { + return "__buddy_gemm_block_size__Z"; +} + +static constexpr StringRef getGemmPipelineStageAttrName() { + return "__buddy_gemm_pipeline_stage__"; +} + +static constexpr StringRef getMatmulKMainLoopMarker() { + return "__buddy_gemm_main_loopk__"; +} + +constexpr StringRef getLinalgMMALevelAttrName() { + return "__buddy_mma_level__"; +} + +constexpr StringRef getMMAPatternAttrName() { return "__buddy_mma__"; } + + +std::optional> getGemmTileSize(func::FuncOp funcOp); + +std::optional> getGemmBlockSize(func::FuncOp funcOp); + +std::optional getGemmPipelineStages(func::FuncOp funcOp); + +void setMarker(mlir::Operation *op, llvm::StringRef marker); + +bool hasMarker(Operation *op, StringRef marker); + + } // namespace buddy::pipelines } // namespace buddy diff --git a/midend/lib/Dialect/Transform/Transforms/TransformInsertion.cpp b/midend/lib/Dialect/Transform/Transforms/TransformInsertion.cpp index 7018174fb8..ed6d6e4d59 100644 --- a/midend/lib/Dialect/Transform/Transforms/TransformInsertion.cpp +++ b/midend/lib/Dialect/Transform/Transforms/TransformInsertion.cpp @@ -35,7 +35,6 @@ void insertTransformIR(func::FuncOp funcOp, OpBuilder &builder, const TransformInsertionConfig &config) { funcOp->walk([&](Operation *op) { if (config.opFilter(op)) { - op->print(llvm::errs()); ImplicitLocOpBuilder b(op->getLoc(), builder); MLIRContext *ctx = b.getContext(); @@ -52,14 +51,11 @@ void insertTransformIR(func::FuncOp funcOp, OpBuilder &builder, loc, blockArg.getType(), blockArg, ArrayAttr(), transform::MatchInterfaceEnumAttr(), annotationAttr, TypeAttr() /*ArrayAttr()*/); - // debug - match->print(llvm::errs()); - llvm::errs() << "\n"; ImplicitLocOpBuilder ib(loc, b); + // pass op's handle to transform config.transformBuilder(ib, op, match); b.create(loc); }); - funcOp->print(llvm::errs()); } }); } diff --git a/midend/lib/Pipelines/GPU/GemmCodegenTransform.cpp b/midend/lib/Pipelines/GPU/GemmCodegenTransform.cpp index 8bca10bfd8..0b3c8f3e4f 100644 --- a/midend/lib/Pipelines/GPU/GemmCodegenTransform.cpp +++ b/midend/lib/Pipelines/GPU/GemmCodegenTransform.cpp @@ -1,5 +1,4 @@ #include 
"Transform/Transforms/TransformInsertion.h" -#include "GPU/Transforms/Utils.h" #include "Pipelines/GPU/GemmCodegenTransform.h" #include "Pipelines/GPU/Utils.h" @@ -18,10 +17,11 @@ #include using namespace mlir; +using namespace mlir::buddy; namespace { -void createGPUAddGemmCodegenLoweringConfigTransformImpl( +void createAddGemmCodegenLoweringConfigTransformImpl( OpPassManager &pm, const std::string &anchor, const std::string &prefix, ArrayRef tileConfig, ArrayRef workGroup, int64_t stages) { @@ -30,24 +30,24 @@ void createGPUAddGemmCodegenLoweringConfigTransformImpl( TransformInsertionConfig config; config.funcAnchor = anchor; - config.matchPrefix = prefix; - + // transform operation takes effect needed to have this op config.opFilter = [=](Operation *op){ - if (mlir::buddy::isLinalgMatmul(op)) { + if (isLinalgMatmul(op)) { return true; } return false; }; + // pdlV is a handle of op config.transformBuilder = [=](ImplicitLocOpBuilder &b, Operation *op, Value pdlV) { auto tileConfigAttrs = b.getAttr(llvm::to_vector( llvm::map_range(vecTileConfig, [&](int64_t i) -> Attribute { - return b.getI64IntegerAttr(i); + return b.getI64IntegerAttr(i); }))); auto workgroupAttrs = b.getAttr(llvm::to_vector( llvm::map_range(vecWorkGroup, [&](int64_t i) -> Attribute { - return b.getI64IntegerAttr(i); + return b.getI64IntegerAttr(i); }))); auto stagesAttr = b.getI64IntegerAttr(stages); @@ -58,28 +58,55 @@ void createGPUAddGemmCodegenLoweringConfigTransformImpl( /* op_name */ b.getStringAttr(func::FuncOp::getOperationName()), /* deduplicate */ false, /* nth_parent */ 1); - - Value tileConfigValue = b.create( - /* type */ pdl::AttributeType::get(b.getContext()), - /* value */ tileConfigAttrs + + Value tileConfigValue_M = b.create( + transform::ParamType::get(b.getContext(), mlir::IntegerType::get(b.getContext(), 64)), + /* value */ tileConfigAttrs.getValue()[0] + ); + + Value tileConfigValue_N = b.create( + transform::ParamType::get(b.getContext(), mlir::IntegerType::get(b.getContext(), 64)), + /* value */ tileConfigAttrs.getValue()[1] + ); + + Value tileConfigValue_K = b.create( + // /* type */ pdl::AttributeType::get(b.getContext()), + transform::ParamType::get(b.getContext(), mlir::IntegerType::get(b.getContext(), 64)), + /* value */ tileConfigAttrs.getValue()[2] + ); + + Value workGroupValue_X = b.create( + transform::ParamType::get(b.getContext(), mlir::IntegerType::get(b.getContext(), 64)), + /* value */ workgroupAttrs.getValue()[0] ); - llvm::errs() << tileConfigValue << "\n"; + Value workGroupValue_Y = b.create( + transform::ParamType::get(b.getContext(), mlir::IntegerType::get(b.getContext(), 64)), + /* value */ workgroupAttrs.getValue()[1] + ); - Value workGroupValue = b.create( - /* type */ pdl::AttributeType::get(b.getContext()), - /* value */ workgroupAttrs + Value workGroupValue_Z = b.create( + transform::ParamType::get(b.getContext(), mlir::IntegerType::get(b.getContext(), 64)), + /* value */ workgroupAttrs.getValue()[2] ); Value stagesValue = b.create( - /* type */ pdl::AttributeType::get(b.getContext()), + transform::ParamType::get(b.getContext(), mlir::IntegerType::get(b.getContext(), 64)), /* value */ stagesAttr ); - b.create(func, getGemmTileConfigAttrName(), - tileConfigValue); - b.create(func, getGemmBlockSizeAttrName(), - workGroupValue); + b.create(func, getGemmTileMConfigAttrName(), + tileConfigValue_M); + b.create(func, getGemmTileNConfigAttrName(), + tileConfigValue_N); + b.create(func, getGemmTileKConfigAttrName(), + tileConfigValue_K); + b.create(func, 
getGemmBlockXSizeAttrName(), + workGroupValue_X); + b.create(func, getGemmBlockYSizeAttrName(), + workGroupValue_Y); + b.create(func, getGemmBlockZSizeAttrName(), + workGroupValue_Z); b.create(func, getGemmPipelineStageAttrName(), stagesValue); }; @@ -89,10 +116,150 @@ void createGPUAddGemmCodegenLoweringConfigTransformImpl( } // namespace -void mlir::buddy::createGPUGemmTileConfigInsertTransform( +void mlir::buddy::createGemmTileConfigInsertTransform( OpPassManager &pm, const GPUGemmCodegenConfigOptions &options) { - mlir::buddy::invokeOpPassPipelineBuilder( - createGPUAddGemmCodegenLoweringConfigTransformImpl, pm, + invokeOpPassPipelineBuilder( + createAddGemmCodegenLoweringConfigTransformImpl, pm, options.funcAnchor, options.annotatePrefix, options.tileConfig, options.workGroup, options.stages); +} + +namespace { + +// TODO: Epilogue +void createGemmTileTransformImpl(OpPassManager &pm, + const std::string &anchor, + const std::string &prefix) { + TransformInsertionConfig config; + config.funcAnchor = anchor; + config.matchPrefix = prefix; + config.opFilter = [=](Operation *op){ + if (isLinalgMatmul(op)) { + return true; + } + return false; + }; + config.transformBuilder = [=](ImplicitLocOpBuilder &b, Operation *op, Value pdlV) { + func::FuncOp funcOp = op->getParentOfType(); + linalg::LinalgOp linalgOp = cast(op); + + SmallVector tileConfig = getGemmTileSize(funcOp).value(); + SmallVector workGroup = getGemmBlockSize(funcOp).value(); + int64_t stages = getGemmPipelineStages(funcOp).value(); + + bool hasEpilogue = false; + + auto func = b.create( + pdlV.getType(), pdlV, + /* isolated_from_above */ false, + /* allow_empty_results */ false, + /* op_name */ b.getStringAttr(func::FuncOp::getOperationName()), + /* deduplicate */ false, + /* nth_parent */ 1); + + auto linalgFillType = transform::OperationType::get( + b.getContext(), linalg::FillOp::getOperationName() + ); + auto linalgFillOp = b.create( + /* resultTypes */ linalgFillType, + /* target */ func, + /* opNames */ linalg::FillOp::getOperationName() + ); + + SmallVector mappingIdx; + bool isBMM = linalgOp.getNumParallelLoops() == 3; + if (isBMM) { + // 2 -> blockIdx.z 1 -> blockIdx.y 0->blockIdx.x + mappingIdx = {2, 1, 0}; + } else { + // 1 -> blockIdx.y 0 -> blockIdx.x + mappingIdx = {1, 0}; + } + + // get GPU BlockIdx mapping + auto mapping = llvm::to_vector(llvm::map_range( + mappingIdx, + [](int64_t i){return static_cast(i); + })); + auto mappingAttrs = llvm::to_vector(llvm::map_range( + mapping, + [&](gpu::MappingId dim) -> Attribute { + return gpu::GPUBlockMappingAttr::get(b.getContext(), dim); + })); + + SmallVector parallelTileSizes; + if (isBMM) { + parallelTileSizes = {1, tileConfig[0], tileConfig[1]}; + } else { + parallelTileSizes = {tileConfig[0], tileConfig[1]}; + } + + // tile DimM and DimN and each tile dispathes to block + Value tiledMatmulOp; + if (hasEpilogue) { + // TODO + } else { + transform::TileUsingForallOp tiledResultOp = + b.create( + /* target */ pdlV, + /* staticTileSizes */ parallelTileSizes, + /* ctor tag */ transform::TileSizesSpec(), + /* mapping */ b.getArrayAttr(mappingAttrs) + ); + + if (linalgFillOp) { + b.create( + /* producerOp */ linalgFillOp, + /* containingOp */ tiledResultOp.getForallOp() + ); + } + tiledMatmulOp = tiledResultOp.getTiledOp(); + } + + // only tile DimK of the matmul which is dispatched to each block + SmallVector reduceTileSize; + if (isBMM) { + reduceTileSize = {0, 0, 0, tileConfig[2]}; + } else { + reduceTileSize = {0, 0, tileConfig[2]}; + } + + auto tiledKMatmulOp = + 
b.create( + /* target */ tiledMatmulOp, + /* staticTileSizes */ reduceTileSize + ); + + // for k in K steps tileConfig[2] + auto forLoops = tiledKMatmulOp.getLoops(); + // tiledmatmul computes at (BM, BN, tileConfig[2]) + auto kMatmulOp = tiledKMatmulOp.getTiledLinalgOp(); + + if (!forLoops.empty()) { + b.create(forLoops[0], getMatmulKMainLoopMarker(), + Value()); + } else { + b.create(kMatmulOp, getMatmulKMainLoopMarker(), + Value()); + } + + // Value mmaLevel = b.create( + // /* type */ transform::ParamType::get(b.getContext(), b.getStringAttr()), + // /* value */ b.getStringAttr("Threadblock") + // ); + + // b.create(kMatmulOp, getLinalgMMALevelAttrName(), + // mmaLevel); + b.create(kMatmulOp, getMMAPatternAttrName(), + Value()); + }; + pm.addPass(createGenericTransformInsertionPass(config)); +} +} // namespace + +void mlir::buddy::createGemmTileTransform(OpPassManager &pm, + const GPUGemmGeneralOptions &options) { + invokeOpPassPipelineBuilder( + createGemmTileTransformImpl, pm, + options.funcAnchor, options.annotatePrefix); } \ No newline at end of file diff --git a/midend/lib/Pipelines/GPU/Utils.cpp b/midend/lib/Pipelines/GPU/Utils.cpp index 7298195995..67a6b61e70 100644 --- a/midend/lib/Pipelines/GPU/Utils.cpp +++ b/midend/lib/Pipelines/GPU/Utils.cpp @@ -3,9 +3,12 @@ #include "mlir/Dialect/Linalg/Utils/Utils.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" #include "Pipelines/GPU/Utils.h" +#include + using namespace mlir; using namespace llvm; @@ -48,5 +51,59 @@ bool isLinalgMatmul(Operation *op) { } } + +std::optional> getGemmTileSize(func::FuncOp funcOp) { + if (funcOp->hasAttr(getGemmTileMConfigAttrName()) && + funcOp->hasAttr(getGemmTileNConfigAttrName()) && + funcOp->hasAttr(getGemmTileKConfigAttrName())) { + auto tileConfigM = funcOp->getAttrOfType( + getGemmTileMConfigAttrName()).getInt(); + auto tileConfigN = funcOp->getAttrOfType( + getGemmTileNConfigAttrName()).getInt(); + auto tileConfigK = funcOp->getAttrOfType( + getGemmTileKConfigAttrName()).getInt(); + + llvm::SmallVector configVec = {tileConfigM, tileConfigN, tileConfigK}; + + return configVec; + } + return std::nullopt; +} + +std::optional> getGemmBlockSize(func::FuncOp funcOp) { + if (funcOp->hasAttr(getGemmBlockXSizeAttrName()) && + funcOp->hasAttr(getGemmBlockYSizeAttrName()) && + funcOp->hasAttr(getGemmBlockZSizeAttrName())) { + auto blockSizeX = funcOp->getAttrOfType( + getGemmBlockXSizeAttrName()).getInt(); + auto blockSizeY = funcOp->getAttrOfType( + getGemmBlockYSizeAttrName()).getInt(); + auto blockSizeZ = funcOp->getAttrOfType( + getGemmBlockZSizeAttrName()).getInt(); + + llvm::SmallVector blockSizeVec = {blockSizeX, blockSizeY, blockSizeZ}; + + return blockSizeVec; + } + return std::nullopt; +} + +std::optional getGemmPipelineStages(func::FuncOp funcOp) { + if (funcOp->hasAttr(getGemmPipelineStageAttrName())) { + auto stages = funcOp->getAttrOfType( + getGemmPipelineStageAttrName()).getInt(); + return stages; + } + return std::nullopt; +} + +void setMarker(Operation *op, StringRef marker) { + op->setAttr(marker, UnitAttr::get(op->getContext())); +} + +bool hasMarker(Operation *op, StringRef marker) { + return op->hasAttrOfType(marker); +} + } // namespace mlir::buddy } // namespace mlir \ No newline at end of file diff --git a/midend/lib/Pipelines/LinalgTensorOpt.cpp b/midend/lib/Pipelines/LinalgTensorOpt.cpp index a0bce01fd3..ff688eb3c1 100644 --- a/midend/lib/Pipelines/LinalgTensorOpt.cpp +++ 
b/midend/lib/Pipelines/LinalgTensorOpt.cpp @@ -13,14 +13,20 @@ namespace { void addGPULinalgOptPasses(OpPassManager &pm) { { // Gemm Codegen Linalg Tensor Opt + // TileSizeConfig of Dim (M) & Dim(N) & Dim(K) -> BM & BN & BK SmallVector tileConfig = {32, 32, 16}; + // blockIdx.x y z SmallVector workGroup = {32, 2, 1}; int64_t stages = 3; - mlir::buddy::GPUGemmCodegenConfigOptions configOption; - configOption.tileConfig = tileConfig; - configOption.workGroup = workGroup; - configOption.stages = stages; - createGPUGemmTileConfigInsertTransform(pm, configOption); + mlir::buddy::GPUGemmCodegenConfigOptions configOptions; + configOptions.tileConfig = tileConfig; + configOptions.workGroup = workGroup; + configOptions.stages = stages; + createGemmTileConfigInsertTransform(pm, configOptions); + pm.addPass(createTransformDialectInterpreter(true)); + + mlir::buddy::GPUGemmGeneralOptions generalOptions; + createGemmTileTransform(pm, generalOptions); pm.addPass(createTransformDialectInterpreter(true)); } } From 63ad6c961ee2f9e97102180af528479d6133c528 Mon Sep 17 00:00:00 2001 From: CJ77Qi Date: Sat, 14 Sep 2024 17:03:51 +0800 Subject: [PATCH 6/7] add linalg-promotion pass --- examples/BuddyMatmul/linalg-matmul.mlir | 18 +- midend/include/Dialect/CMakeLists.txt | 3 +- midend/include/Dialect/GPU/Passes.h | 0 midend/include/Dialect/GPU/Passes.td | 3 + midend/include/Dialect/Linalg/CMakeLists.txt | 3 + midend/include/Dialect/Linalg/Passes.h | 15 ++ midend/include/Dialect/Linalg/Passes.td | 6 + .../Linalg/Transforms/LinalgPromotion.h | 16 ++ midend/include/Pipelines/BufferizeOpt.h | 29 +++ midend/include/Pipelines/LinalgMemrefOpt.h | 29 +++ .../GPU/Utils.h => Utils/GemmCodegenUtils.h} | 68 ++++-- midend/include/Utils/PipelineUtils.h | 34 +++ midend/lib/Dialect/CMakeLists.txt | 1 + midend/lib/Dialect/Linalg/CMakeLists.txt | 1 + .../Dialect/Linalg/Transforms/CMakeLists.txt | 11 + .../Linalg/Transforms/LinalgPromotion.cpp | 205 ++++++++++++++++++ .../Dialect/Linalg/Transforms/PassDetail.h | 13 ++ midend/lib/Pipelines/BufferizeOpt.cpp | 35 +++ midend/lib/Pipelines/CMakeLists.txt | 6 +- midend/lib/Pipelines/GPU/CMakeLists.txt | 2 +- .../Pipelines/GPU/GemmCodegenTransform.cpp | 3 +- midend/lib/Pipelines/LinalgMemrefOpt.cpp | 38 ++++ midend/lib/Pipelines/LinalgTensorOpt.cpp | 24 +- midend/lib/Utils/CMakeLists.txt | 16 ++ .../Utils.cpp => Utils/GemmCodegenUtils.cpp} | 63 +++++- midend/lib/Utils/PipelineUtils.cpp | 12 + tools/buddy-opt/CMakeLists.txt | 1 + tools/buddy-opt/buddy-opt.cpp | 5 +- 28 files changed, 612 insertions(+), 48 deletions(-) create mode 100644 midend/include/Dialect/GPU/Passes.h create mode 100644 midend/include/Dialect/GPU/Passes.td create mode 100644 midend/include/Dialect/Linalg/CMakeLists.txt create mode 100644 midend/include/Dialect/Linalg/Passes.h create mode 100644 midend/include/Dialect/Linalg/Passes.td create mode 100644 midend/include/Dialect/Linalg/Transforms/LinalgPromotion.h create mode 100644 midend/include/Pipelines/BufferizeOpt.h create mode 100644 midend/include/Pipelines/LinalgMemrefOpt.h rename midend/include/{Pipelines/GPU/Utils.h => Utils/GemmCodegenUtils.h} (54%) create mode 100644 midend/include/Utils/PipelineUtils.h create mode 100644 midend/lib/Dialect/Linalg/CMakeLists.txt create mode 100644 midend/lib/Dialect/Linalg/Transforms/CMakeLists.txt create mode 100644 midend/lib/Dialect/Linalg/Transforms/LinalgPromotion.cpp create mode 100644 midend/lib/Dialect/Linalg/Transforms/PassDetail.h create mode 100644 midend/lib/Pipelines/BufferizeOpt.cpp create mode 100644 
midend/lib/Pipelines/LinalgMemrefOpt.cpp rename midend/lib/{Pipelines/GPU/Utils.cpp => Utils/GemmCodegenUtils.cpp} (69%) create mode 100644 midend/lib/Utils/PipelineUtils.cpp diff --git a/examples/BuddyMatmul/linalg-matmul.mlir b/examples/BuddyMatmul/linalg-matmul.mlir index 72a70d48aa..2690acb3b6 100644 --- a/examples/BuddyMatmul/linalg-matmul.mlir +++ b/examples/BuddyMatmul/linalg-matmul.mlir @@ -2,20 +2,20 @@ #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> module attributes {} { - func.func private @Matmul(%arg0: tensor<1280x32xf16>, %arg1: tensor<32x1280xf16>) -> tensor<1280x1280xf16> attributes {} { + func.func private @Matmul(%arg0: tensor<1024x512xf16>, %arg1: tensor<512x1024xf16>) -> tensor<1024x1024xf16> attributes {} { %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1280x1280xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<1280x1280xf16>) -> tensor<1280x1280xf16> - %2 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<1280x32xf16>, tensor<32x1280xf16>) outs(%1 : tensor<1280x1280xf16>) { + %0 = tensor.empty() : tensor<1024x1024xf16> + %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<1024x1024xf16>) -> tensor<1024x1024xf16> + %2 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<1024x512xf16>, tensor<512x1024xf16>) outs(%1 : tensor<1024x1024xf16>) { ^bb0(%in: f16, %in_0: f16, %out: f16): %3 = arith.mulf %in, %in_0 : f16 %4 = arith.addf %out, %3 : f16 linalg.yield %4 : f16 - } -> tensor<1280x1280xf16> - return %2 : tensor<1280x1280xf16> + } -> tensor<1024x1024xf16> + return %2 : tensor<1024x1024xf16> } - func.func @forward(%arg0: tensor<1280x32xf16>, %arg1: tensor<32x1280xf16>) -> tensor<1280x1280xf16> { - %0 = call @Matmul(%arg0, %arg1) : (tensor<1280x32xf16>, tensor<32x1280xf16>) -> tensor<1280x1280xf16> - return %0 : tensor<1280x1280xf16> + func.func @forward(%arg0: tensor<1024x512xf16>, %arg1: tensor<512x1024xf16>) -> tensor<1024x1024xf16> { + %0 = call @Matmul(%arg0, %arg1) : (tensor<1024x512xf16>, tensor<512x1024xf16>) -> tensor<1024x1024xf16> + return %0 : tensor<1024x1024xf16> } } diff --git a/midend/include/Dialect/CMakeLists.txt b/midend/include/Dialect/CMakeLists.txt index 7b472d0d2d..9bae4c7796 100644 --- a/midend/include/Dialect/CMakeLists.txt +++ b/midend/include/Dialect/CMakeLists.txt @@ -5,4 +5,5 @@ add_subdirectory(RVV) add_subdirectory(VectorExp) add_subdirectory(Gemmini) add_subdirectory(Sche) -add_subdirectory(Transform) \ No newline at end of file +add_subdirectory(Transform) +add_subdirectory(Linalg) \ No newline at end of file diff --git a/midend/include/Dialect/GPU/Passes.h b/midend/include/Dialect/GPU/Passes.h new file mode 100644 index 0000000000..e69de29bb2 diff --git a/midend/include/Dialect/GPU/Passes.td b/midend/include/Dialect/GPU/Passes.td new file mode 100644 index 0000000000..b247a5373f --- /dev/null +++ b/midend/include/Dialect/GPU/Passes.td @@ -0,0 +1,3 @@ +include "mlir/Pass/PassBase.td" + +def \ No newline at end of file diff --git a/midend/include/Dialect/Linalg/CMakeLists.txt b/midend/include/Dialect/Linalg/CMakeLists.txt new file mode 100644 index 0000000000..1ef1e9b24e --- /dev/null +++ b/midend/include/Dialect/Linalg/CMakeLists.txt @@ -0,0 +1,3 @@ +set(LLVM_TARGET_DEFINITIONS Passes.td) +mlir_tablegen(Passes.h.inc -gen-pass-decls) +add_public_tablegen_target(BuddyLinalgPassIncGen) \ No newline 
at end of file
diff --git a/midend/include/Dialect/Linalg/Passes.h b/midend/include/Dialect/Linalg/Passes.h
new file mode 100644
index 0000000000..a9982e5404
--- /dev/null
+++ b/midend/include/Dialect/Linalg/Passes.h
@@ -0,0 +1,15 @@
+#ifndef DIALECT_LINALG_PASSES_H
+#define DIALECT_LINALG_PASSES_H
+
+// Include the constructor of passes in Linalg Dialect
+#include "Linalg/Transforms/LinalgPromotion.h"
+
+namespace mlir {
+// Generate the definition of Linalg Passes
+#define GEN_PASS_DECL
+#include "Linalg/Passes.h.inc"
+
+#define GEN_PASS_REGISTRATION
+#include "Linalg/Passes.h.inc"
+
+} // namespace mlir \ No newline at end of file
diff --git a/midend/include/Dialect/Linalg/Passes.td b/midend/include/Dialect/Linalg/Passes.td
new file mode 100644
index 0000000000..d959b63284
--- /dev/null
+++ b/midend/include/Dialect/Linalg/Passes.td
@@ -0,0 +1,6 @@
+include "mlir/Pass/PassBase.td"
+
+def LinalgPromotion : Pass<"linalg-promotion", "func::FuncOp"> {
+  let summary = "Promote linalg matmul operand subviews to shared-memory buffers via memref.alloc and linalg.copy";
+  let constructor = "mlir::createLinalgPromotionPass()";
+} \ No newline at end of file
diff --git a/midend/include/Dialect/Linalg/Transforms/LinalgPromotion.h b/midend/include/Dialect/Linalg/Transforms/LinalgPromotion.h
new file mode 100644
index 0000000000..8405e24bb1
--- /dev/null
+++ b/midend/include/Dialect/Linalg/Transforms/LinalgPromotion.h
@@ -0,0 +1,16 @@
+#ifndef LINALG_TRANSFORMS_LINALGPROMOTION_H
+#define LINALG_TRANSFORMS_LINALGPROMOTION_H
+
+#include "mlir/Pass/Pass.h"
+#include "llvm/ADT/StringRef.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include <memory>
+
+namespace mlir {
+
+std::unique_ptr<OperationPass<func::FuncOp>>
+createLinalgPromotionPass();
+
+} // namespace mlir
+
+#endif \ No newline at end of file
diff --git a/midend/include/Pipelines/BufferizeOpt.h b/midend/include/Pipelines/BufferizeOpt.h
new file mode 100644
index 0000000000..a459863cf8
--- /dev/null
+++ b/midend/include/Pipelines/BufferizeOpt.h
@@ -0,0 +1,29 @@
+#ifndef PIPELINES_BUFFERIZEOPT_H
+#define PIPELINES_BUFFERIZEOPT_H
+
+#include "Pipelines/LinalgTensorOpt.h"
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Pass/PassOptions.h"
+#include "mlir/Pass/PassRegistry.h"
+#include "llvm/Support/CommandLine.h"
+#include <string>
+
+namespace mlir {
+namespace buddy {
+
+struct BuddyBufferizeOptOptions :
+    public PassPipelineOptions<BuddyBufferizeOptOptions> {
+  Option<std::string> target{
+      *this, "target",
+      llvm::cl::desc("An option to specify the target"),
+  };
+};
+
+void createBufferizeOptPipeline(OpPassManager &pm, const BuddyBufferizeOptOptions &options);
+
+void registerBufferizeOptPassPipeline();
+
+} // namespace mlir::buddy
+} // namespace mlir
+
+#endif \ No newline at end of file
diff --git a/midend/include/Pipelines/LinalgMemrefOpt.h b/midend/include/Pipelines/LinalgMemrefOpt.h
new file mode 100644
index 0000000000..b61032efef
--- /dev/null
+++ b/midend/include/Pipelines/LinalgMemrefOpt.h
@@ -0,0 +1,29 @@
+#ifndef PIPELINES_MEMREFOPT_H
+#define PIPELINES_MEMREFOPT_H
+
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Pass/PassOptions.h"
+#include "mlir/Pass/PassRegistry.h"
+#include "llvm/Support/CommandLine.h"
+#include <string>
+
+namespace mlir {
+namespace buddy {
+
+struct LinalgMemrefOptPipelineOptions :
+    public PassPipelineOptions<LinalgMemrefOptPipelineOptions> {
+  Option<std::string> target{
+      *this, "target",
+      llvm::cl::desc("An optional attribute to specify the target."),
+  };
+};
+
+void createLinalgMemrefOptPipeline(OpPassManager &pm,
+                                   const LinalgMemrefOptPipelineOptions &options);
+
+void registerLinalgMemrefOptPipeline();
+
+} // namespace mlir::buddy
+} // namespace mlir
+
+#endif \ No
newline at end of file diff --git a/midend/include/Pipelines/GPU/Utils.h b/midend/include/Utils/GemmCodegenUtils.h similarity index 54% rename from midend/include/Pipelines/GPU/Utils.h rename to midend/include/Utils/GemmCodegenUtils.h index 7821b53c31..69c4acae6c 100644 --- a/midend/include/Pipelines/GPU/Utils.h +++ b/midend/include/Utils/GemmCodegenUtils.h @@ -1,8 +1,12 @@ #ifndef PIPELINES_GPU_UTILS_H #define PIPELINES_GPU_UTILS_H +#include "mlir/IR/OperationSupport.h" #include "mlir/Pass/PassManager.h" #include "mlir/Pass/PassRegistry.h" +#include "llvm/ADT/StringRef.h" +#include +#include namespace mlir { class ModuleOp; @@ -13,25 +17,15 @@ class FuncOp; namespace buddy { -template -void invokeOpPassPipelineBuilder(Builder builder, OpPassManager &pm, - Args &&...args) { - if (pm.getOpAnchorName() != OpPassManager::getAnyOpAnchorName() && - pm.getOpAnchorName() != OpClass::getOperationName()) { - if (pm.getNesting() == OpPassManager::Nesting::Implicit) { - builder(pm.nest(), std::forward(args)...); - return; - } - llvm::report_fatal_error( - llvm::Twine("Can't build pass pipeline on expected op type ") + - OpClass::getOperationName() + " but got " + pm.getOpAnchorName()); - } else { - builder(pm, std::forward(args)...); - } -} - bool isLinalgMatmul(Operation *op); +void setMarker(mlir::Operation *op, llvm::StringRef marker); + +bool hasMarker(Operation *op, StringRef marker); + +static constexpr StringRef getGemmMarkerAttrName() { + return "__buddy_gemm__"; +} static constexpr StringRef getGemmTileMConfigAttrName() { return "__buddy_gemm_tile_config__M"; @@ -65,12 +59,37 @@ static constexpr StringRef getMatmulKMainLoopMarker() { return "__buddy_gemm_main_loopk__"; } -constexpr StringRef getLinalgMMALevelAttrName() { - return "__buddy_mma_level__"; +static constexpr StringRef getLinalgMMALevelAttrName() { + return "__buddy_mma_level__"; } -constexpr StringRef getMMAPatternAttrName() { return "__buddy_mma__"; } +static constexpr StringRef getMMAPatternAttrName() { + return "__buddy_mma__"; +} +static constexpr StringRef getAllocSharedMemoryAMarker() { + return "__buddy_smem_matrix_a__"; +}; + +static constexpr StringRef getAllocSharedMemoryBMarker() { + return "__buddy_smem_matrix_b__"; +}; + +static constexpr StringRef getAllocSharedMemoryAccMarker() { + return "__buddy_smem_accumulator__"; +}; + +static constexpr StringRef getCopyToSharedMemoryAMarker() { + return "__buddy_load_matrix_a__"; +}; + +static constexpr StringRef getCopyToSharedMemoryBMarker() { + return "__buddy_load_matrix_b__"; +}; + +static constexpr StringRef getCopyFromSharedMemoryAccMarker() { + return "__buddy_store_matrix_c__"; +}; std::optional> getGemmTileSize(func::FuncOp funcOp); @@ -78,12 +97,13 @@ std::optional> getGemmBlockSize(func::FuncOp funcOp); std::optional getGemmPipelineStages(func::FuncOp funcOp); -void setMarker(mlir::Operation *op, llvm::StringRef marker); +bool funcHasGemm(func::FuncOp funcOp); -bool hasMarker(Operation *op, StringRef marker); +bool isMappedToGPUBlock(scf::ForallOp forallOp); +std::optional getForallOpMappedToBlock(func::FuncOp funcOp); -} // namespace buddy::pipelines -} // namespace buddy +} // namespace mlir::buddy +} // namespace mlir #endif \ No newline at end of file diff --git a/midend/include/Utils/PipelineUtils.h b/midend/include/Utils/PipelineUtils.h new file mode 100644 index 0000000000..215810c40c --- /dev/null +++ b/midend/include/Utils/PipelineUtils.h @@ -0,0 +1,34 @@ +#ifndef UTILS_PIPELINEUTILS_H +#define UTILS_PIPELINEUTILS_H + +#include "mlir/IR/BuiltinOps.h" 
+#include "mlir/Pass/PassManager.h" +#include "mlir/Pass/PassRegistry.h" + +namespace mlir { +class ModuleOp; +namespace buddy { + +template +void invokeOpPassPipelineBuilder(Builder builder, OpPassManager &pm, + Args &&...args) { + if (pm.getOpAnchorName() != OpPassManager::getAnyOpAnchorName() && + pm.getOpAnchorName() != OpClass::getOperationName()) { + if (pm.getNesting() == OpPassManager::Nesting::Implicit) { + builder(pm.nest(), std::forward(args)...); + return; + } + llvm::report_fatal_error( + llvm::Twine("Can't build pass pipeline on expected op type ") + + OpClass::getOperationName() + " but got " + pm.getOpAnchorName()); + } else { + builder(pm, std::forward(args)...); + } +} + +void addCleanUpPassPipeline(OpPassManager &pm, bool isModuleOp = true); + +} // namespace mlir::buddy +} // namespace mlir + +#endif \ No newline at end of file diff --git a/midend/lib/Dialect/CMakeLists.txt b/midend/lib/Dialect/CMakeLists.txt index 304de8570c..f0e2ac6c13 100644 --- a/midend/lib/Dialect/CMakeLists.txt +++ b/midend/lib/Dialect/CMakeLists.txt @@ -6,3 +6,4 @@ add_subdirectory(VectorExp) add_subdirectory(Gemmini) add_subdirectory(Sche) add_subdirectory(Transform) +add_subdirectory(Linalg) diff --git a/midend/lib/Dialect/Linalg/CMakeLists.txt b/midend/lib/Dialect/Linalg/CMakeLists.txt new file mode 100644 index 0000000000..5c919f7dfc --- /dev/null +++ b/midend/lib/Dialect/Linalg/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(Transforms) \ No newline at end of file diff --git a/midend/lib/Dialect/Linalg/Transforms/CMakeLists.txt b/midend/lib/Dialect/Linalg/Transforms/CMakeLists.txt new file mode 100644 index 0000000000..8a69b82a09 --- /dev/null +++ b/midend/lib/Dialect/Linalg/Transforms/CMakeLists.txt @@ -0,0 +1,11 @@ +add_mlir_dialect_library(BuddyLinalgPasses + LinalgPromotion.cpp + + DEPENDS + BuddyLinalgPassIncGen + + LINK_LIBS PUBLIC + MLIRIR + MLIRPass + MLIRTransformDialect +) diff --git a/midend/lib/Dialect/Linalg/Transforms/LinalgPromotion.cpp b/midend/lib/Dialect/Linalg/Transforms/LinalgPromotion.cpp new file mode 100644 index 0000000000..31f4225e43 --- /dev/null +++ b/midend/lib/Dialect/Linalg/Transforms/LinalgPromotion.cpp @@ -0,0 +1,205 @@ +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/Linalg/Transforms/Transforms.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/Utils/StaticValueUtils.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinAttributeInterfaces.h" +#include "mlir/IR/IRMapping.h" +#include "mlir/IR/Matchers.h" +#include "mlir/IR/OperationSupport.h" +#include "mlir/Support/LLVM.h" + +#include "Linalg/Transforms/LinalgPromotion.h" +#include "PassDetail.h" +#include "Utils/GemmCodegenUtils.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace mlir; +using namespace mlir::buddy; + +namespace { + +constexpr int64_t MatmulOperand_A = 0; +constexpr int64_t MatmulOperand_B = 1; +constexpr int64_t MatmulOperand_C = 2; + +constexpr StringRef allocMarker[3] = {getAllocSharedMemoryAMarker(), + getAllocSharedMemoryBMarker(), + getAllocSharedMemoryAccMarker()}; + +constexpr StringRef copyMarker[3] = {getCopyToSharedMemoryAMarker(), + getCopyToSharedMemoryBMarker(), + getCopyFromSharedMemoryAccMarker()}; + +template +std::optional allocateWorkgroupMemory(OpBuilder &builder, memref::SubViewOp subview, + ArrayRef boundingSubViewSize, DataLayout &layout) { + + OpBuilder::InsertionGuard guard(builder); + + 
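+  // Shared-memory promotion only applies inside the scf.forall that was
+  // mapped to GPU blocks; the new buffer is hoisted to the top of its body.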
scf::ForallOp forallOp = subview->getParentOfType(); + if (!forallOp) { + return std::nullopt; + } + + SmallVector shapes; + for (Value bound : boundingSubViewSize) { + APInt value; + if (!matchPattern(bound, m_ConstantInt(&value))) { + return std::nullopt; + } + shapes.push_back(value.getSExtValue()); + } + + builder.setInsertionPointToStart(forallOp.getBody()); + + auto smemBufferType = MemRefType::get(shapes, subview.getType().getElementType(), + MemRefLayoutAttrInterface{}, + gpu::AddressSpaceAttr::get(builder.getContext(), + gpu::GPUDialect::getWorkgroupAddressSpace())); + + memref::AllocOp smemBuffer = builder.create(forallOp.getLoc(), smemBufferType); + setMarker(smemBuffer, allocMarker[OPERAND]); + + return smemBuffer; + +} + +// nvgpu device mem no need to manually deallocate +LogicalResult deallocateWorkgroupMemory(OpBuilder &, Value /*buffer*/) { + return success(); +} + +template +LogicalResult copyGlobalMemoryToWorkgroupMemory(OpBuilder &b, Value src, + Value dst) { + if (OPERAND == MatmulOperand_C) { + return success(); + } + auto copyOp = b.create(src.getLoc(), src, dst); + setMarker(copyOp, copyMarker[OPERAND]); + return success(); +} + +template +LogicalResult copyWorkgroupMemoryToGlobalMemory(OpBuilder &b, Value src, + Value dst) { + OpBuilder::InsertionGuard guard(b); + if (OPERAND == MatmulOperand_A || + OPERAND == MatmulOperand_B) { + return success(); + } + auto op = src.getDefiningOp(); + + // Because MatmulC Operand is out of scf::ForOp so we need to get ForallOp first + scf::ForallOp forallOp = op->getParentOfType(); + auto forOps = llvm::to_vector(forallOp.getOps()); + if (forOps.size() == 1) { + // set copymem op insertion point after compute block + b.setInsertionPointAfter(forOps[0]); + } + if (forOps.size() > 1) { + return failure(); + } + b.create(src.getLoc()); + linalg::CopyOp copyOp = b.create(src.getLoc(), src, dst); + setMarker(copyOp, copyMarker[MatmulOperand_C]); + return success(); +} + + +template +static linalg::LinalgPromotionOptions getPromotionOptionsForMatmulOperand() { + linalg::LinalgPromotionOptions promotionOptions; + promotionOptions + .setAllocationDeallocationFns(allocateWorkgroupMemory, + deallocateWorkgroupMemory) + .setCopyInOutFns(copyGlobalMemoryToWorkgroupMemory, + copyWorkgroupMemoryToGlobalMemory) + .setOperandsToPromote({OPERAND}) + .setUseFullTileBuffers({false, false}); + return promotionOptions; +} + +template +static LogicalResult promotionImpl(OpBuilder &builder, Operation *op) { + linalg::LinalgPromotionOptions promotionOptions = + getPromotionOptionsForMatmulOperand(); + + if (failed(linalg::promoteSubviewsPrecondition(op, promotionOptions))) { + return failure(); + } + + std::optional promotedLinalgOp = + linalg::promoteSubViews(builder, cast(op), promotionOptions); + + if (!promotedLinalgOp) { + return op->emitError("subview promotion failed!"); + } + return success(); +} + +struct LinalgPromotionPass : public LinalgPromotionBase { +public: + LinalgPromotionPass() = default; + + void runOnOperation() override { + // the whole promotion pipeline is + // split m, n, promote C, split k, promote A & B + func::FuncOp funcOp = getOperation(); + SmallVector LinalgOpsToPromote; + + if (!funcHasGemm(funcOp)) { + return; + } + + if (!getForallOpMappedToBlock(funcOp)) { + return; + } + scf::ForallOp forallOp = getForallOpMappedToBlock(funcOp).value(); + + forallOp->walk([&](linalg::LinalgOp linalgOp) { + if (isLinalgMatmul(linalgOp)) { + LinalgOpsToPromote.push_back(linalgOp); + } + }); + if (LinalgOpsToPromote.empty()) { + return; 
+ } + assert(LinalgOpsToPromote.size() == 1); + auto linalgContractOp = LinalgOpsToPromote[0]; + + // set Builder insertion point before the linalgContractOp + OpBuilder b(linalgContractOp); + promotionImpl(b, linalgContractOp); + promotionImpl(b, linalgContractOp); + + // set the insertion before forop to alloc MatrixC + scf::ForOp forOp = linalgContractOp->getParentOfType(); + if (!forOp) { + b.setInsertionPoint(linalgContractOp); + } else { + b.setInsertionPoint(forOp); + } + + promotionImpl(b, linalgContractOp); + + + } +}; +} // namespace mlir + +std::unique_ptr> +mlir::createLinalgPromotionPass() { + return std::make_unique(); +} diff --git a/midend/lib/Dialect/Linalg/Transforms/PassDetail.h b/midend/lib/Dialect/Linalg/Transforms/PassDetail.h new file mode 100644 index 0000000000..9b5555267d --- /dev/null +++ b/midend/lib/Dialect/Linalg/Transforms/PassDetail.h @@ -0,0 +1,13 @@ +#ifndef DIALECT_LINALG_TRANSFORMS_PASSDETAIL_H +#define DIALECT_LINALG_TRANSFORMS_PASSDETAIL_H + +#include "mlir/Pass/Pass.h" + +namespace mlir { + +#define GEN_PASS_CLASSES +#include "Linalg/Passes.h.inc" + +} // namespace mlir + +#endif \ No newline at end of file diff --git a/midend/lib/Pipelines/BufferizeOpt.cpp b/midend/lib/Pipelines/BufferizeOpt.cpp new file mode 100644 index 0000000000..b7abdb3680 --- /dev/null +++ b/midend/lib/Pipelines/BufferizeOpt.cpp @@ -0,0 +1,35 @@ +#include "Pipelines/BufferizeOpt.h" +#include "mlir/Dialect/Bufferization/Transforms/OneShotModuleBufferize.h" +#include "mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h" +#include "mlir/Dialect/Bufferization/Transforms/Passes.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/MemRef/Transforms/Passes.h" +#include "mlir/Pass/PassManager.h" +#include "mlir/Pass/PassRegistry.h" +#include "Utils/GemmCodegenUtils.h" +#include "Utils/PipelineUtils.h" + +using namespace mlir; + +void mlir::buddy::createBufferizeOptPipeline(OpPassManager &pm, + const BuddyBufferizeOptOptions &options) { + mlir::buddy::invokeOpPassPipelineBuilder( + [&](OpPassManager &pm) { + // OneShotBufferization not implement bufferize on funcOp's arguments on default + bufferization::OneShotBufferizationOptions bufferizeOptions; + bufferizeOptions.bufferizeFunctionBoundaries = true; + // bufferizeOptions.allowReturnAllocsFromLoops + pm.addNestedPass(bufferization::createEmptyTensorEliminationPass()); + pm.addPass(bufferization::createOneShotBufferizePass(bufferizeOptions)); + pm.addNestedPass(memref::createFoldMemRefAliasOpsPass()); + addCleanUpPassPipeline(pm); + }, pm); +} + +void mlir::buddy::registerBufferizeOptPassPipeline() { + PassPipelineRegistration( + "bufferize-opt", + "bufferize opt lowering tensor to memref", + createBufferizeOptPipeline + ); +} \ No newline at end of file diff --git a/midend/lib/Pipelines/CMakeLists.txt b/midend/lib/Pipelines/CMakeLists.txt index ed97f0e647..59cd289a1c 100644 --- a/midend/lib/Pipelines/CMakeLists.txt +++ b/midend/lib/Pipelines/CMakeLists.txt @@ -2,9 +2,13 @@ add_subdirectory(GPU) add_mlir_library(BuddyPipelines LinalgTensorOpt.cpp - GPU/Utils.cpp + BufferizeOpt.cpp + LinalgMemrefOpt.cpp + LINK_LIBS PUBLIC MLIRIR BuddyGPUPipelines BuddyTransformPasses + BuddyGemmCodegenUtils + BuddyPipelineUtils ) \ No newline at end of file diff --git a/midend/lib/Pipelines/GPU/CMakeLists.txt b/midend/lib/Pipelines/GPU/CMakeLists.txt index 1f5a151312..5ca25dd744 100644 --- a/midend/lib/Pipelines/GPU/CMakeLists.txt +++ b/midend/lib/Pipelines/GPU/CMakeLists.txt @@ -1,6 +1,5 @@ add_mlir_library(BuddyGPUPipelines 
 GemmCodegenTransform.cpp
-  Utils.cpp

   LINK_LIBS PUBLIC
   MLIRIR
@@ -8,4 +7,5 @@ add_mlir_library(BuddyGPUPipelines
   MLIRTransformDialect
   MLIRTransforms
   BuddyTransformPasses
+  BuddyGemmCodegenUtils
 ) \ No newline at end of file
diff --git a/midend/lib/Pipelines/GPU/GemmCodegenTransform.cpp b/midend/lib/Pipelines/GPU/GemmCodegenTransform.cpp
index 0b3c8f3e4f..b4e599da09 100644
--- a/midend/lib/Pipelines/GPU/GemmCodegenTransform.cpp
+++ b/midend/lib/Pipelines/GPU/GemmCodegenTransform.cpp
@@ -1,6 +1,7 @@
 #include "Transform/Transforms/TransformInsertion.h"
 #include "Pipelines/GPU/GemmCodegenTransform.h"
-#include "Pipelines/GPU/Utils.h"
+#include "Utils/GemmCodegenUtils.h"
+#include "Utils/PipelineUtils.h"

 #include "mlir/Dialect/Transform/IR/TransformOps.h"
 #include "mlir/Dialect/PDL/IR/PDLOps.h"
diff --git a/midend/lib/Pipelines/LinalgMemrefOpt.cpp b/midend/lib/Pipelines/LinalgMemrefOpt.cpp
new file mode 100644
index 0000000000..a7d5538162
--- /dev/null
+++ b/midend/lib/Pipelines/LinalgMemrefOpt.cpp
@@ -0,0 +1,38 @@
+#include "Pipelines/LinalgMemrefOpt.h"
+#include "Linalg/Transforms/LinalgPromotion.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Pass/PassRegistry.h"
+#include "Utils/PipelineUtils.h"
+#include "mlir/Transforms/Passes.h"
+#include <string>
+
+using namespace mlir;
+
+namespace {
+
+void addGemmLinalgMemrefOptPipeline(OpPassManager &pm) {
+  // TODO: use funcAnchor to nest into the specific matmul func.
+  pm.addNestedPass<func::FuncOp>(createLinalgPromotionPass());
+  pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
+  pm.addNestedPass<func::FuncOp>(createCSEPass());
+  pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
+}
+
+void createLinalgMemrefOptPipelineImpl(OpPassManager &pm,
+                                       const std::string &target) {
+  addGemmLinalgMemrefOptPipeline(pm);
+}
+
+} // namespace
+
+void mlir::buddy::createLinalgMemrefOptPipeline(OpPassManager &pm,
+    const LinalgMemrefOptPipelineOptions &options) {
+  invokeOpPassPipelineBuilder(createLinalgMemrefOptPipelineImpl, pm, options.target);
+}
+
+void mlir::buddy::registerLinalgMemrefOptPipeline() {
+  PassPipelineRegistration<LinalgMemrefOptPipelineOptions>(
+      "linalg-memref-opt", "Linalg Opt Pipeline with Memref",
+      createLinalgMemrefOptPipeline);
+} \ No newline at end of file
diff --git a/midend/lib/Pipelines/LinalgTensorOpt.cpp b/midend/lib/Pipelines/LinalgTensorOpt.cpp
index ff688eb3c1..8ee373fda2 100644
--- a/midend/lib/Pipelines/LinalgTensorOpt.cpp
+++ b/midend/lib/Pipelines/LinalgTensorOpt.cpp
@@ -1,11 +1,15 @@
 #include "Pipelines/LinalgTensorOpt.h"
-#include "Pipelines/GPU/Utils.h"
+#include "Utils/GemmCodegenUtils.h"
+#include "Utils/PipelineUtils.h"
 #include "Pipelines/GPU/GemmCodegenTransform.h"

 #include "Transform/Transforms/TransformDialectInterpreter.h"
 #include "Transform/Transforms/TransformInsertion.h"

+#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/Linalg/Passes.h"
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Transforms/Passes.h"

 using namespace mlir;

@@ -13,10 +17,16 @@ namespace {

 void addGPULinalgOptPasses(OpPassManager &pm) {
   { // Gemm Codegen Linalg Tensor Opt
+    // TODO: anchor only on funcs that actually contain a GEMM linalg op.
+    // For now the option's funcAnchor is left empty, which treats every
+    // func as if it contained a matmul.
+    auto funcGemmAnchor = mlir::buddy::getGemmMarkerAttrName().str();
     // TileSizeConfig of Dim (M) & Dim(N) & Dim(K) -> BM & BN & BK
-    SmallVector<int64_t> tileConfig = {32, 32, 16};
-    // blockIdx.x y z
-    SmallVector<int64_t> workGroup = {32, 2, 1};
+    // blockIdx.y = M / BM
+    // blockIdx.x = N / BN
+    SmallVector<int64_t> tileConfig = {128, 128, 64};
+    // threadIdx.x y z
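+    // 32 x 4 x 1 = 128 threads per block, i.e. four warps of 32 threads.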
+ SmallVector workGroup = {32, 4, 1}; int64_t stages = 3; mlir::buddy::GPUGemmCodegenConfigOptions configOptions; configOptions.tileConfig = tileConfig; @@ -28,6 +38,8 @@ void addGPULinalgOptPasses(OpPassManager &pm) { mlir::buddy::GPUGemmGeneralOptions generalOptions; createGemmTileTransform(pm, generalOptions); pm.addPass(createTransformDialectInterpreter(true)); + pm.addPass(createCanonicalizerPass()); + pm.addPass(createCSEPass()); } } @@ -50,5 +62,7 @@ void mlir::buddy::createLinalgTensorOptPassPipeline(OpPassManager &pm, void mlir::buddy::registerLinalgTensorOptPassPipeline() { PassPipelineRegistration( - "linalg-tensor-opt", "Linalg with Tensor Opt Pass Pipeline", mlir::buddy::createLinalgTensorOptPassPipeline); + "linalg-tensor-opt", + "Linalg with Tensor Opt Pass Pipeline", + mlir::buddy::createLinalgTensorOptPassPipeline); } diff --git a/midend/lib/Utils/CMakeLists.txt b/midend/lib/Utils/CMakeLists.txt index ff9aa6e380..f9101f1b64 100644 --- a/midend/lib/Utils/CMakeLists.txt +++ b/midend/lib/Utils/CMakeLists.txt @@ -3,6 +3,8 @@ add_mlir_library(BuddyUtils DIPUtils.cpp DAPUtils.cpp AffineTransformUtils.cpp + GemmCodegenUtils.cpp + PipelineUtils.cpp ) add_mlir_library(BuddyDIPUtils @@ -18,3 +20,17 @@ add_mlir_library(BuddyDAPUtils LINK_LIBS PUBLIC BuddyUtils ) + +add_mlir_library(BuddyGemmCodegenUtils + GemmCodegenUtils.cpp + + LINK_LIBS PUBLIC + BuddyUtils +) + +add_mlir_library(BuddyPipelineUtils + PipelineUtils.cpp + + LINK_LIBS PUBLIC + BuddyUtils +) \ No newline at end of file diff --git a/midend/lib/Pipelines/GPU/Utils.cpp b/midend/lib/Utils/GemmCodegenUtils.cpp similarity index 69% rename from midend/lib/Pipelines/GPU/Utils.cpp rename to midend/lib/Utils/GemmCodegenUtils.cpp index 67a6b61e70..8b087846d3 100644 --- a/midend/lib/Pipelines/GPU/Utils.cpp +++ b/midend/lib/Utils/GemmCodegenUtils.cpp @@ -2,10 +2,17 @@ #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/Linalg/Utils/Utils.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" -#include "Pipelines/GPU/Utils.h" +#include "Utils/GemmCodegenUtils.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/MLIRContext.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/raw_ostream.h" #include @@ -15,6 +22,14 @@ using namespace llvm; namespace mlir { namespace buddy { +void setMarker(Operation *op, StringRef marker) { + op->setAttr(marker, UnitAttr::get(op->getContext())); +} + +bool hasMarker(Operation *op, StringRef marker) { + return op->hasAttrOfType(marker); +} + bool isLinalgMatmul(Operation *op) { if (!llvm::isa(op)) { return false; @@ -97,12 +112,50 @@ std::optional getGemmPipelineStages(func::FuncOp funcOp) { return std::nullopt; } -void setMarker(Operation *op, StringRef marker) { - op->setAttr(marker, UnitAttr::get(op->getContext())); +bool funcHasGemm(func::FuncOp funcOp) { + // TODO + return true; } -bool hasMarker(Operation *op, StringRef marker) { - return op->hasAttrOfType(marker); +bool isMappedToGPUBlock(scf::ForallOp forallOp) { + + SmallVector mappingIdx{2, 1, 0}; + MLIRContext ctx; + ctx.loadDialect(); + auto mapping = llvm::to_vector(llvm::map_range( + mappingIdx, + [](int64_t i){return static_cast(i);})); + auto mappingAttrs = llvm::to_vector(llvm::map_range( + mapping, + [&](gpu::MappingId dim) -> Attribute { + return gpu::GPUBlockMappingAttr::get(&ctx, dim);})); + ArrayAttr 
getMappingAttrs = forallOp->getAttrOfType("mapping"); + + if (!getMappingAttrs) { + return false; + } else { + for (auto mappingAttr : getMappingAttrs) { + if (mappingAttr.isa()) { + return true; + } + } + return false; + } +} + +std::optional getForallOpMappedToBlock(func::FuncOp funcOp) { + SmallVector forallOps; + funcOp->walk([&](scf::ForallOp forallOp){ + if (isMappedToGPUBlock(forallOp)) { + forallOps.push_back(forallOp); + } + }); + // one func one kernel -> one func only have one forallOp mapped to block + if (forallOps.size() != 1) { + llvm::errs() << "this funcOp has no ForallOp MappedToBlock\n"; + return std::nullopt; + } + return forallOps[0]; } } // namespace mlir::buddy diff --git a/midend/lib/Utils/PipelineUtils.cpp b/midend/lib/Utils/PipelineUtils.cpp new file mode 100644 index 0000000000..34c4600e38 --- /dev/null +++ b/midend/lib/Utils/PipelineUtils.cpp @@ -0,0 +1,12 @@ +#include "Utils/PipelineUtils.h" +#include "mlir/Transforms/Passes.h" + +using namespace mlir; + +void mlir::buddy::addCleanUpPassPipeline(OpPassManager &pm, bool isModuleOp) { + pm.addPass(createCSEPass()); + pm.addPass(createCanonicalizerPass()); + if (isModuleOp) { + pm.addPass(createSymbolDCEPass()); + } +} \ No newline at end of file diff --git a/tools/buddy-opt/CMakeLists.txt b/tools/buddy-opt/CMakeLists.txt index 7a87da7f24..1b448824d3 100644 --- a/tools/buddy-opt/CMakeLists.txt +++ b/tools/buddy-opt/CMakeLists.txt @@ -40,4 +40,5 @@ target_link_libraries(buddy-opt BuddyPipelines BuddyGPUPipelines BuddyTransformPasses + BuddyLinalgPasses ) diff --git a/tools/buddy-opt/buddy-opt.cpp b/tools/buddy-opt/buddy-opt.cpp index fc96aa8bae..178532da11 100644 --- a/tools/buddy-opt/buddy-opt.cpp +++ b/tools/buddy-opt/buddy-opt.cpp @@ -47,7 +47,6 @@ #include "Gemmini/GemminiOps.h" #include "Sche/ScheDialect.h" #include "Sche/ScheOps.h" -#include "Pipelines/LinalgTensorOpt.h" namespace mlir { namespace buddy { @@ -73,6 +72,8 @@ void registerDeviceSchedulePass(); void registerLowerSchePass(); void registerFuncBufferizeDynamicOffsetPass(); void registerLinalgTensorOptPassPipeline(); +void registerBufferizeOptPassPipeline(); +void registerLinalgMemrefOptPipeline(); } // namespace buddy } // namespace mlir @@ -108,6 +109,8 @@ int main(int argc, char **argv) { // Register Pipeline Passes mlir::buddy::registerLinalgTensorOptPassPipeline(); + mlir::buddy::registerLinalgMemrefOptPipeline(); + mlir::buddy::registerBufferizeOptPassPipeline(); mlir::DialectRegistry registry; // Register all MLIR core dialects. 
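With the registrations above in place, a minimal end-to-end invocation looks roughly like the following. The pipeline names come from the PassPipelineRegistration calls in these patches; the exact "target=gpu" option string is an assumption based on the target Option declared for the pipeline options.

    buddy-opt examples/BuddyMatmul/linalg-matmul.mlir \
        -linalg-tensor-opt="target=gpu" \
        -bufferize-opt \
        -linalg-memref-opt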
From 46244c0e2bf6cdca92df4398cbb02bbe080a0652 Mon Sep 17 00:00:00 2001 From: CJ77Qi Date: Fri, 20 Sep 2024 15:34:46 +0800 Subject: [PATCH 7/7] add GPUDistributeToWarpPass & RemoveRedundantPass in GPU GEMM Codegen Pipeline --- midend/include/Dialect/CMakeLists.txt | 3 +- midend/include/Dialect/GPU/CMakeLists.txt | 3 + midend/include/Dialect/GPU/Passes.h | 18 + midend/include/Dialect/GPU/Passes.td | 10 +- .../GPU/Transforms/GPUDistributeToWarp.h | 15 + .../GPU/Transforms/RemoveReduntantLoops.h | 16 + midend/include/Dialect/Linalg/Passes.h | 4 +- .../Linalg/Transforms/LinalgPromotion.h | 4 +- midend/include/Dialect/Transform/Passes.h | 7 +- midend/lib/Dialect/CMakeLists.txt | 1 + midend/lib/Dialect/GPU/CMakeLists.txt | 1 + .../lib/Dialect/GPU/Transforms/CMakeLists.txt | 11 + .../GPU/Transforms/GPUDistributeToWarp.cpp | 233 +++++++++++++ .../lib/Dialect/GPU/Transforms/PassDetail.h | 13 + .../GPU/Transforms/RemoveReduntantLoops.cpp | 313 ++++++++++++++++++ .../Linalg/Transforms/LinalgPromotion.cpp | 7 +- midend/lib/Pipelines/CMakeLists.txt | 1 + midend/lib/Pipelines/LinalgMemrefOpt.cpp | 4 + 18 files changed, 656 insertions(+), 8 deletions(-) create mode 100644 midend/include/Dialect/GPU/CMakeLists.txt create mode 100644 midend/include/Dialect/GPU/Transforms/GPUDistributeToWarp.h create mode 100644 midend/include/Dialect/GPU/Transforms/RemoveReduntantLoops.h create mode 100644 midend/lib/Dialect/GPU/Transforms/CMakeLists.txt create mode 100644 midend/lib/Dialect/GPU/Transforms/GPUDistributeToWarp.cpp create mode 100644 midend/lib/Dialect/GPU/Transforms/PassDetail.h create mode 100644 midend/lib/Dialect/GPU/Transforms/RemoveReduntantLoops.cpp diff --git a/midend/include/Dialect/CMakeLists.txt b/midend/include/Dialect/CMakeLists.txt index 9bae4c7796..2f98ccaba0 100644 --- a/midend/include/Dialect/CMakeLists.txt +++ b/midend/include/Dialect/CMakeLists.txt @@ -6,4 +6,5 @@ add_subdirectory(VectorExp) add_subdirectory(Gemmini) add_subdirectory(Sche) add_subdirectory(Transform) -add_subdirectory(Linalg) \ No newline at end of file +add_subdirectory(Linalg) +add_subdirectory(GPU) \ No newline at end of file diff --git a/midend/include/Dialect/GPU/CMakeLists.txt b/midend/include/Dialect/GPU/CMakeLists.txt new file mode 100644 index 0000000000..d268af3b42 --- /dev/null +++ b/midend/include/Dialect/GPU/CMakeLists.txt @@ -0,0 +1,3 @@ +set(LLVM_TARGET_DEFINITIONS Passes.td) +mlir_tablegen(Passes.h.inc -gen-pass-decls) +add_public_tablegen_target(BuddyGPUPassIncGen) \ No newline at end of file diff --git a/midend/include/Dialect/GPU/Passes.h b/midend/include/Dialect/GPU/Passes.h index e69de29bb2..f7dcca88e6 100644 --- a/midend/include/Dialect/GPU/Passes.h +++ b/midend/include/Dialect/GPU/Passes.h @@ -0,0 +1,18 @@ +#ifndef DIALECT_GPU_PASSES_H +#define DIALECT_GPU_PASSES_H + +// Include the constructor of passes in GPU Dialect +#include "GPU/Transforms/GPUDistributeToWarp.h" +#include "GPU/Transforms/RemoveReduntantLoops.h" + +namespace mlir { +// Generate the definition of GPU Passes +#define GEN_PASS_DECL +#include "GPU/Passes.h.inc" + +#define GEN_PASS_REGISTRATION +#include "GPU/Passes.h.inc" + +} // namespace mlir + +#endif \ No newline at end of file diff --git a/midend/include/Dialect/GPU/Passes.td b/midend/include/Dialect/GPU/Passes.td index b247a5373f..5c85dcbfd5 100644 --- a/midend/include/Dialect/GPU/Passes.td +++ b/midend/include/Dialect/GPU/Passes.td @@ -1,3 +1,11 @@ include "mlir/Pass/PassBase.td" -def \ No newline at end of file +def GPUDistributeToWarp : Pass<"gpu-distribute-to-warp", 
"func::FuncOp"> { + let summary = "GPU distribute from Block level to Warp"; + let constructor = "mlir::createGPUDistributeToWarpPass()"; +} + +def RemoveReduntantLoops : Pass<"remove-reduntant-loops", "func::FuncOp"> { + let summary = "Remove the loops that only run once in gpu kernel"; + let constructor = "mlir::createRemoveReduntantLoops()"; +} \ No newline at end of file diff --git a/midend/include/Dialect/GPU/Transforms/GPUDistributeToWarp.h b/midend/include/Dialect/GPU/Transforms/GPUDistributeToWarp.h new file mode 100644 index 0000000000..c48622d822 --- /dev/null +++ b/midend/include/Dialect/GPU/Transforms/GPUDistributeToWarp.h @@ -0,0 +1,15 @@ +#ifndef DIALECT_GPU_TRANSFORMS_GPUDistributeToWarp_H +#define DIALECT_GPU_TRANSFORMS_GPUDistributeToWarp_H + +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Pass/Pass.h" +#include "llvm/ADT/StringRef.h" +#include + +namespace mlir { + +std::unique_ptr> createGPUDistributeToWarpPass(); + +} // namespace mlir + +#endif \ No newline at end of file diff --git a/midend/include/Dialect/GPU/Transforms/RemoveReduntantLoops.h b/midend/include/Dialect/GPU/Transforms/RemoveReduntantLoops.h new file mode 100644 index 0000000000..7489038db8 --- /dev/null +++ b/midend/include/Dialect/GPU/Transforms/RemoveReduntantLoops.h @@ -0,0 +1,16 @@ +#ifndef DIALECT_GPU_TRANSFORMS_REMOVEREDUNTANTLOOPS_H +#define DIALECT_GPU_TRANSFORMS_REMOVEREDUNTANTLOOPS_H + +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/IR/Operation.h" +#include "mlir/Pass/Pass.h" +#include "llvm/ADT/StringRef.h" +#include + +namespace mlir { + +std::unique_ptr> createRemoveReduntantLoops(); + +} // namespace mlir + +#endif \ No newline at end of file diff --git a/midend/include/Dialect/Linalg/Passes.h b/midend/include/Dialect/Linalg/Passes.h index a9982e5404..d4136bde9a 100644 --- a/midend/include/Dialect/Linalg/Passes.h +++ b/midend/include/Dialect/Linalg/Passes.h @@ -12,4 +12,6 @@ namespace mlir { #define GEN_PASS_REGISTRATION #include "Linalg/Passes.h.inc" -} // namespace mlir \ No newline at end of file +} // namespace mlir + +#endif \ No newline at end of file diff --git a/midend/include/Dialect/Linalg/Transforms/LinalgPromotion.h b/midend/include/Dialect/Linalg/Transforms/LinalgPromotion.h index 8405e24bb1..30f5b35e66 100644 --- a/midend/include/Dialect/Linalg/Transforms/LinalgPromotion.h +++ b/midend/include/Dialect/Linalg/Transforms/LinalgPromotion.h @@ -1,5 +1,5 @@ -#ifndef LINALG_TRANSFORMS_LINALGPROMOTION_H -#define LINALG_TRANSFORMS_LINALGPROMOTION_H +#ifndef DIALECT_LINALG_TRANSFORMS_LINALGPROMOTION_H +#define DIALECT_LINALG_TRANSFORMS_LINALGPROMOTION_H #include "mlir/Pass/Pass.h" #include "llvm/ADT/StringRef.h" diff --git a/midend/include/Dialect/Transform/Passes.h b/midend/include/Dialect/Transform/Passes.h index ae7bc8b394..99c0796b07 100644 --- a/midend/include/Dialect/Transform/Passes.h +++ b/midend/include/Dialect/Transform/Passes.h @@ -1,3 +1,6 @@ +#ifndef DIALECT_TRANSFORM_PASSES_H +#define DIALECT_TRANSFORM_PASSES_H + #include "mlir/Pass/Pass.h" #include "mlir/Pass/PassRegistry.h" @@ -14,4 +17,6 @@ class ModuleOp; #define GEN_PASS_REGISTRATION #include "Transform/Passes.h.inc" -} \ No newline at end of file +} + +#endif \ No newline at end of file diff --git a/midend/lib/Dialect/CMakeLists.txt b/midend/lib/Dialect/CMakeLists.txt index f0e2ac6c13..2f98ccaba0 100644 --- a/midend/lib/Dialect/CMakeLists.txt +++ b/midend/lib/Dialect/CMakeLists.txt @@ -7,3 +7,4 @@ add_subdirectory(Gemmini) add_subdirectory(Sche) add_subdirectory(Transform) 
 add_subdirectory(Linalg)
+add_subdirectory(GPU)
\ No newline at end of file
diff --git a/midend/lib/Dialect/GPU/CMakeLists.txt b/midend/lib/Dialect/GPU/CMakeLists.txt
index e69de29bb2..5c919f7dfc 100644
--- a/midend/lib/Dialect/GPU/CMakeLists.txt
+++ b/midend/lib/Dialect/GPU/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(Transforms)
\ No newline at end of file
diff --git a/midend/lib/Dialect/GPU/Transforms/CMakeLists.txt b/midend/lib/Dialect/GPU/Transforms/CMakeLists.txt
new file mode 100644
index 0000000000..58e45c61c1
--- /dev/null
+++ b/midend/lib/Dialect/GPU/Transforms/CMakeLists.txt
@@ -0,0 +1,11 @@
+add_mlir_dialect_library(BuddyGPUPasses
+  GPUDistributeToWarp.cpp
+  RemoveReduntantLoops.cpp
+
+  DEPENDS
+  BuddyGPUPassIncGen
+
+  LINK_LIBS PUBLIC
+  MLIRIR
+  MLIRPass
+)
diff --git a/midend/lib/Dialect/GPU/Transforms/GPUDistributeToWarp.cpp b/midend/lib/Dialect/GPU/Transforms/GPUDistributeToWarp.cpp
new file mode 100644
index 0000000000..0a50cad71b
--- /dev/null
+++ b/midend/lib/Dialect/GPU/Transforms/GPUDistributeToWarp.cpp
@@ -0,0 +1,233 @@
+#include "GPU/Transforms/GPUDistributeToWarp.h"
+#include "PassDetail.h"
+#include "Utils/GemmCodegenUtils.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h"
+#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
+#include "mlir/Dialect/Linalg/Utils/Utils.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/IR/AffineExpr.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/IR/Value.h"
+#include "mlir/Support/LLVM.h"
+#include "mlir/Support/LogicalResult.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Casting.h"
+#include <algorithm>
+#include <memory>
+#include <optional>
+
+using namespace mlir;
+using namespace mlir::buddy;
+
+namespace {
+
+static constexpr int64_t warpSize = 32;
+
+// Get the parallel dims of the linalg op's loops;
+// parallel dims whose range is 1 are dropped.
+SmallVector<unsigned>
+getPartitionedLoopsFromLinalgOp(linalg::LinalgOp linalgOp) {
+  SmallVector<unsigned> parallelLoops;
+  // Return the dims that are parallel loops.
+  linalgOp.getParallelDims(parallelLoops);
+
+  // loopRanges holds the static size of each loop dimension.
+  SmallVector<int64_t> loopRanges = linalgOp.getStaticLoopRanges();
+  // Remove the dimensions in parallelLoops whose range is 1.
+  parallelLoops = llvm::to_vector(
+      llvm::make_filter_range(parallelLoops, [=](unsigned loopDim) {
+        return loopRanges[loopDim] != 1;
+      }));
+  return parallelLoops;
+}
+
+// LinalgTilingOptions needs this function to compute tile sizes;
+// pass it to LinalgTilingOptions::setTileSizeComputationFunction.
+// `op` is expected to be a LinalgOp.
+// Warp-level tiling only tiles dim M and dim N.
+std::optional<SmallVector<Value>>
+warpTileSizeComputationFunc(SmallVector<int64_t, 3> warpWorkGroup,
+                            OpBuilder &builder, Operation *op) {
+  func::FuncOp funcOp = op->getParentOfType<func::FuncOp>();
+  // Get the block-level tile config from the annotation in funcOp's attribute.
+  std::optional<SmallVector<int64_t, 3>> optionalGemmTileSize =
+      getGemmTileSize(funcOp);
+  if (!optionalGemmTileSize.has_value()) {
+    return std::nullopt;
+  }
+  SmallVector<int64_t, 3> gemmTileSize = optionalGemmTileSize.value();
+
+  SmallVector<int64_t> blockTileSize;
+  auto linalgOp = cast<linalg::LinalgOp>(op);
+  // blockTileSize only takes gemmTileSize's M and N: dim K is not tiled here.
+  if (linalgOp.getNumParallelLoops() == 3) { // BMM
+    blockTileSize = {0, gemmTileSize[0], gemmTileSize[1]};
+  } else { // Matmul
+    blockTileSize = {gemmTileSize[0], gemmTileSize[1]};
+  }
+
+  SmallVector<Value> warpTileSize;
+
+  auto partitionedLoops = getPartitionedLoopsFromLinalgOp(linalgOp);
+
+  auto zero = builder.create<arith::ConstantIndexOp>(op->getLoc(), 0);
+  warpTileSize.resize(cast<linalg::LinalgOp>(op).getLoopIteratorTypes().size(),
+                      zero);
+  // If only M and N are parallel, drop warpWorkGroup[2].
+  warpWorkGroup.resize(partitionedLoops.size());
+  // partitionedLoops is (M, N), which must map to (warpIdx.y, warpIdx.x).
+  std::reverse(warpWorkGroup.begin(), warpWorkGroup.end());
+
+  unsigned idx = 0;
+  for (auto depth : partitionedLoops) {
+    if (depth >= blockTileSize.size()) {
+      continue;
+    }
+    warpTileSize[depth] = builder.create<arith::ConstantIndexOp>(
+        op->getLoc(),
+        llvm::divideCeil(blockTileSize[depth], warpWorkGroup[idx++]));
+  }
+  return warpTileSize;
+}
+
+SmallVector<linalg::ProcInfo> getWarpInfoDistributedToTile(
+    OpBuilder &b, Location loc, ArrayRef<Range> parallelLoopRanges,
+    unsigned warpSize, SmallVector<int64_t, 3> warpWorkGroup) {
+  unsigned parallelDimNum = parallelLoopRanges.size();
+  SmallVector<linalg::ProcInfo> warpProcInfo(parallelDimNum);
+  SmallVector<gpu::Dimension> gpuDimAttrs = {
+      gpu::Dimension::x, gpu::Dimension::y, gpu::Dimension::z};
+  for (unsigned i = 0; i < parallelDimNum; i++) {
+    Value threadIdx_i = b.create<gpu::ThreadIdOp>(loc, gpuDimAttrs[i]);
+    Value warpIdx_i;
+    if (i == 0) { // warpIdx.x = threadIdx.x / 32
+      mlir::AffineExpr d0 = b.getAffineDimExpr(0);
+      warpIdx_i = affine::makeComposedAffineApply(
+          b, loc, d0.floorDiv(b.getAffineConstantExpr(warpSize)),
+          {threadIdx_i});
+    } else {
+      mlir::AffineExpr d0 = b.getAffineDimExpr(0);
+      warpIdx_i = affine::makeComposedAffineApply(b, loc, d0, {threadIdx_i});
+    }
+    // warpProcInfo[i] is mapped to the tiled scf dim.
+    warpProcInfo[parallelDimNum - 1 - i] = linalg::ProcInfo{
+        warpIdx_i,
+        b.create<arith::ConstantOp>(loc, b.getIndexAttr(warpWorkGroup[i])),
+        linalg::DistributionMethod::Cyclic};
+  }
+  return warpProcInfo;
+}
+
+LogicalResult distributeToWarpLevel(scf::ForallOp forallOp,
+                                    SmallVector<int64_t, 3> &workGroup) {
+  // Use LinalgTilingOptions to distribute to the warp level.
+
+  // Warp index computation:
+  // warpIdx.x = threadIdx.x / 32
+  // warpIdx.y = threadIdx.y
+  // warpIdx.z = threadIdx.z
+  if (workGroup[0] / warpSize == 0) {
+    return failure();
+  }
+  SmallVector<int64_t, 3> warpWorkGroup = {workGroup[0] / warpSize,
+                                           workGroup[1], workGroup[2]};
+
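+  // Worked example (illustrative numbers, not taken from this patch): a
+  // (128, 2, 1) workgroup yields warpWorkGroup = {128 / 32, 2, 1} = {4, 2, 1}.
+  // For a plain matmul with a 128x128 block tile, the partitioned loops are
+  // (M, N), so warpTileSizeComputationFunc resizes warpWorkGroup to {4, 2},
+  // reverses it to {2, 4}, and assigns each warp a
+  // ceil(128 / 2) x ceil(128 / 4) = 64x32 tile.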
+  linalg::LinalgTilingOptions linalgTilingOptions =
+      linalg::LinalgTilingOptions();
+
+  auto tileSizeComputationFunction = [=](OpBuilder &b, Operation *op) {
+    return warpTileSizeComputationFunc(warpWorkGroup, b, op).value();
+  };
+  linalgTilingOptions.setTileSizeComputationFunction(
+      tileSizeComputationFunction);
+
+  linalg::LinalgLoopDistributionOptions distributionOptions;
+  auto getWarpProcInfoFn = [=](OpBuilder &b, Location loc,
+                               ArrayRef<Range> parallelLoopRanges) {
+    return getWarpInfoDistributedToTile(b, loc, parallelLoopRanges, warpSize,
+                                        warpWorkGroup);
+  };
+  distributionOptions.procInfo = getWarpProcInfoFn;
+  linalgTilingOptions.setDistributionOptions(distributionOptions)
+      .setLoopType(linalg::LinalgTilingLoopType::Loops);
+
+  SmallVector<linalg::LinalgOp> candidates;
+  forallOp.walk([&](linalg::LinalgOp linalgOp) {
+    if (isa<linalg::FillOp>(linalgOp) || isLinalgMatmul(linalgOp)) {
+      candidates.push_back(linalgOp);
+    }
+  });
+
+  IRRewriter rewriter(forallOp->getContext());
+  for (auto linalgOp : candidates) {
+    FailureOr<linalg::TiledLinalgOp> res =
+        linalg::tileLinalgOp(rewriter, linalgOp, linalgTilingOptions);
+    if (failed(res)) {
+      return failure();
+    }
+    if (res->tensorResults.empty()) {
+      rewriter.eraseOp(linalgOp);
+    } else {
+      rewriter.replaceOp(linalgOp, res->tensorResults);
+    }
+  }
+
+  return success();
+}
+
+struct GPUDistributeToWarpPass
+    : public GPUDistributeToWarpBase<GPUDistributeToWarpPass> {
+public:
+  GPUDistributeToWarpPass() = default;
+
+  void runOnOperation() override {
+    func::FuncOp funcOp = getOperation();
+
+    if (!funcHasGemm(funcOp)) {
+      return;
+    }
+
+    // Get the block size from the annotation in funcOp's attribute.
+    std::optional<SmallVector<int64_t, 3>> optionalWorkGroup =
+        getGemmBlockSize(funcOp);
+    if (!optionalWorkGroup.has_value()) {
+      return;
+    }
+    SmallVector<int64_t, 3> workGroup = optionalWorkGroup.value();
+
+    std::optional<scf::ForallOp> optionalBlockForallOp =
+        getForallOpMappedToBlock(funcOp);
+    if (!optionalBlockForallOp.has_value()) {
+      return;
+    }
+    scf::ForallOp forallOp = optionalBlockForallOp.value();
+    if (failed(distributeToWarpLevel(forallOp, workGroup))) {
+      return signalPassFailure();
+    }
+
+    {
+      // Apply canonicalization patterns.
+      RewritePatternSet threadTilingCanonicalizationPatterns =
+          linalg::getLinalgTilingCanonicalizationPatterns(funcOp.getContext());
+      // populateAffineMinSCFCanonicalizationPattern(
+      //     threadTilingCanonicalizationPatterns);
+      if (failed(applyPatternsAndFoldGreedily(
+              funcOp, std::move(threadTilingCanonicalizationPatterns)))) {
+        return signalPassFailure();
+      }
+    }
+  }
+};
+} // namespace
+
+std::unique_ptr<OperationPass<func::FuncOp>>
+mlir::createGPUDistributeToWarpPass() {
+  return std::make_unique<GPUDistributeToWarpPass>();
+}
diff --git a/midend/lib/Dialect/GPU/Transforms/PassDetail.h b/midend/lib/Dialect/GPU/Transforms/PassDetail.h
new file mode 100644
index 0000000000..4d085863c6
--- /dev/null
+++ b/midend/lib/Dialect/GPU/Transforms/PassDetail.h
@@ -0,0 +1,13 @@
+#ifndef DIALECT_GPU_TRANSFORMS_PASSDETAIL_H
+#define DIALECT_GPU_TRANSFORMS_PASSDETAIL_H
+
+#include "mlir/Pass/Pass.h"
+
+namespace mlir {
+
+#define GEN_PASS_CLASSES
+#include "GPU/Passes.h.inc"
+
+} // namespace mlir
+
+#endif
\ No newline at end of file
diff --git a/midend/lib/Dialect/GPU/Transforms/RemoveReduntantLoops.cpp b/midend/lib/Dialect/GPU/Transforms/RemoveReduntantLoops.cpp
new file mode 100644
index 0000000000..d98f27e12d
--- /dev/null
+++ b/midend/lib/Dialect/GPU/Transforms/RemoveReduntantLoops.cpp
@@ -0,0 +1,313 @@
+// Some code comes from
+// compiler/lib/Dialect/SCF/Transforms/RemoveSingleIterationLoop.cpp
+// Copyright 2024 ByteDance Ltd. and/or its affiliates. All rights reserved.
+// Licensed under the Apache License, Version 2.0 (the "License");
+
+#include "mlir/Dialect/Affine/Utils.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Dialect/Utils/StaticValueUtils.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/SymbolTable.h"
+#include "mlir/IR/Value.h"
+#include "mlir/IR/ValueRange.h"
+#include "mlir/Pass/Pass.h"
+
+#include "GPU/Transforms/RemoveReduntantLoops.h"
+#include "PassDetail.h"
+#include "Utils/GemmCodegenUtils.h"
+#include "mlir/Support/LLVM.h"
+#include "mlir/Support/LogicalResult.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "llvm/ADT/SmallVector.h"
+#include <functional>
+#include <optional>
+
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace mlir;
+using namespace mlir::buddy;
+
+namespace {
+
+using GetMinMaxExprFn =
+    std::function<std::optional<std::pair<AffineExpr, AffineExpr>>(
+        Value value, SmallVectorImpl<Value> &dims,
+        SmallVectorImpl<Value> &symbols)>;
+
+/// The pattern detects single-iteration loops based on the range returned by
+/// the lambda |getMinMaxFn| for some known values.
+struct RemoveSingleIterationLoops : public OpRewritePattern<scf::ForOp> {
+private:
+  /// Compose the map with affine.apply ops and try to simplify it.
+  static void combineAndSimplifyMap(AffineMap &map,
+                                    SmallVectorImpl<Value> &dims,
+                                    SmallVectorImpl<Value> &symbols) {
+    SmallVector<Value> operands(dims.begin(), dims.end());
+    operands.append(symbols.begin(), symbols.end());
+    // Pull in affine.apply operations and compose them fully into the
+    // result.
+    affine::fullyComposeAffineMapAndOperands(&map, &operands);
+    affine::canonicalizeMapAndOperands(&map, &operands);
+    map = simplifyAffineMap(map);
+    // Assign the results.
+    dims.assign(operands.begin(), operands.begin() + map.getNumDims());
+    symbols.assign(operands.begin() + map.getNumDims(), operands.end());
+  }
+
+  /// Replace dimensions and symbols with known ranges in the map expression.
+  // TODO: Use the core function once the interface using a lambda lands.
+  static AffineMap substituteMin(AffineMap map, SmallVectorImpl<Value> &dims,
+                                 SmallVectorImpl<Value> &symbols,
+                                 GetMinMaxExprFn getMinMaxExpr) {
+    combineAndSimplifyMap(map, dims, symbols);
+
+    auto exprs = llvm::to_vector(map.getResults());
+    for (AffineExpr &expr : exprs) {
+      bool substituted = true;
+      while (substituted) {
+        substituted = false;
+        for (unsigned dimIdx = 0; dimIdx < dims.size(); ++dimIdx) {
+          Value dim = dims[dimIdx];
+          auto minMax = getMinMaxExpr(dim, dims, symbols);
+          if (!minMax)
+            continue;
+          AffineExpr dimExpr = getAffineDimExpr(dimIdx, expr.getContext());
+          // Substitute occurrences of `dimExpr` by either the min expression
+          // or the max expression depending on whether the value is used with
+          // a positive or negative coefficient.
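+          // Illustrative example (values assumed, not from this patch): if
+          // `expr` is (128 - d0) and d0 is known to lie in [0, 127], the
+          // coefficient on d0 is negative, so d0 is replaced by its max,
+          // yielding the constant 128 - 127 = 1 as the expression's minimum.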
+          AffineExpr substitutedExpr = affine::substWithMin(
+              expr, dimExpr, minMax->first, minMax->second);
+          substituted = (substitutedExpr != expr);
+          expr = substitutedExpr;
+        }
+        // Substitute symbols.
+        for (unsigned symIdx = 0; symIdx < symbols.size(); ++symIdx) {
+          Value sym = symbols[symIdx];
+          auto minMax = getMinMaxExpr(sym, dims, symbols);
+          if (!minMax)
+            continue;
+          AffineExpr symExpr = getAffineSymbolExpr(symIdx, expr.getContext());
+          AffineExpr substitutedExpr = affine::substWithMin(
+              expr, symExpr, minMax->first, minMax->second);
+          substituted = (substitutedExpr != expr);
+          expr = substitutedExpr;
+        }
+      }
+      map = AffineMap::get(dims.size(), symbols.size(), exprs,
+                           exprs.front().getContext());
+      // Clean up and simplify the results.
+      // This needs to happen outside of the loop iterating on dims.size()
+      // since it modifies dims.
+      combineAndSimplifyMap(map, dims, symbols);
+      // Assign the results.
+      exprs.assign(map.getResults().begin(), map.getResults().end());
+    }
+
+    assert(!exprs.empty() && "Unexpected empty exprs");
+    return AffineMap::get(dims.size(), symbols.size(), exprs,
+                          map.getContext());
+  }
+
+  /// Replaces the given op with the contents of the given single-block region,
+  /// using the operands of the block terminator to replace operation results.
+  static void replaceOpWithRegion(PatternRewriter &rewriter, Operation *op,
+                                  Region &region, ValueRange blockArgs = {}) {
+    assert(llvm::hasSingleElement(region) && "expected single-block region");
+    Block *block = &region.front();
+    Operation *terminator = block->getTerminator();
+    ValueRange results = terminator->getOperands();
+    rewriter.inlineBlockBefore(block, op, blockArgs);
+    rewriter.replaceOp(op, results);
+    rewriter.eraseOp(terminator);
+  }
+
+  static void replaceOpWithBlocks(PatternRewriter &rewriter, Operation *op,
+                                  Region &region, ValueRange blockArgs = {}) {
+  }
+
+  /// Return true if we can prove that we always run at least the first
+  /// iteration of the ForOp.
+  static bool alwaysRunsFirstIteration(scf::ForOp op,
+                                       GetMinMaxExprFn getMinMax) {
+    // Calculate the minimum value of ub - lb. If it is strictly positive it
+    // means the loop will always run at least once.
+    MLIRContext *ctx = op->getContext();
+    SmallVector<Value> dims;
+    SmallVector<Value> symbols;
+    AffineExpr lb = getAffineDimExpr(dims.size(), ctx);
+    dims.push_back(op.getLowerBound());
+    AffineExpr ub = getAffineDimExpr(dims.size(), ctx);
+    dims.push_back(op.getUpperBound());
+    AffineExpr iterZero = ub - lb;
+    auto map = AffineMap::get(dims.size(), 0, iterZero);
+    AffineMap simplifiedMap = substituteMin(map, dims, symbols, getMinMax);
+    assert(simplifiedMap.getNumResults() == 1);
+    auto cst = simplifiedMap.getResult(0).cast<AffineConstantExpr>();
+    if (cst.getValue() > 0)
+      return true;
+    return false;
+  }
+
+  /// Return true if we can prove that we never run more than one iteration
+  /// of the ForOp.
+  static bool neverRunsSecondIteration(scf::ForOp op,
+                                       GetMinMaxExprFn getMinMax) {
+    // Calculate the minimum of lb + step - ub. If it is non-negative, the
+    // loop never runs more than once.
+    MLIRContext *ctx = op->getContext();
+    SmallVector<Value> dims;
+    SmallVector<Value> symbols;
+    AffineExpr lb = getAffineDimExpr(dims.size(), ctx);
+    dims.push_back(op.getLowerBound());
+    AffineExpr ub = getAffineDimExpr(dims.size(), ctx);
+    dims.push_back(op.getUpperBound());
+    AffineExpr step = getAffineDimExpr(dims.size(), ctx);
+    dims.push_back(op.getStep());
+    AffineExpr iterOne = lb + step - ub;
+    auto map = AffineMap::get(dims.size(), 0, iterOne);
+    AffineMap simplifiedMap = substituteMin(map, dims, symbols, getMinMax);
+    assert(simplifiedMap.getNumResults() == 1);
+    auto cst = simplifiedMap.getResult(0).cast<AffineConstantExpr>();
+    if (cst.getValue() >= 0)
+      return true;
+    return false;
+  }
+
+public:
+  RemoveSingleIterationLoops(MLIRContext *context, GetMinMaxExprFn getMinMax)
+      : OpRewritePattern<scf::ForOp>(context, /*benefit=*/1),
+        getMinMax(getMinMax) {}
+
+  LogicalResult matchAndRewrite(scf::ForOp forOp,
+                                PatternRewriter &rewriter) const override {
+    if (!(alwaysRunsFirstIteration(forOp, getMinMax) &&
+          neverRunsSecondIteration(forOp, getMinMax))) {
+      return failure();
+    }
+    SmallVector<Value> blockArgs;
+    blockArgs.reserve(forOp.getInitArgs().size() + 1);
+    blockArgs.push_back(forOp.getLowerBound());
+    llvm::append_range(blockArgs, forOp.getInitArgs());
+    replaceOpWithRegion(rewriter, forOp, forOp.getRegion(), blockArgs);
+    return success();
+  }
+
+private:
+  GetMinMaxExprFn getMinMax;
+};
+
+} // namespace
+
+namespace {
+
+/// Converts a symbolic GPU processor dimension to its numeric one.
+static unsigned gpuDimToIndex(gpu::Dimension dim) {
+  switch (dim) {
+  case gpu::Dimension::x:
+    return 0;
+  case gpu::Dimension::y:
+    return 1;
+  case gpu::Dimension::z:
+    return 2;
+  default:
+    assert(false && "invalid dimension");
+    return 0;
+  }
+}
+
+/// If the value is a threadID, return the range [0, workGroup[dim] - 1];
+/// if it is a blockDim, return the exact range
+/// [workGroup[dim], workGroup[dim]]; if it is a blockID, return
+/// [0, blockGroup[dim] - 1]. As this function is only used in GEMM codegen,
+/// we can assume every loop variable of interest derives from gpu.thread_id,
+/// gpu.block_dim, or gpu.block_id.
+static std::optional<std::pair<AffineExpr, AffineExpr>>
+getWorkgroupRange(Value processorValue, SmallVectorImpl<Value> & /*dims*/,
+                  SmallVectorImpl<Value> & /*symbols*/,
+                  ArrayRef<int64_t> workGroup, ArrayRef<int64_t> blockGroup) {
+  OpBuilder builder(processorValue.getContext());
+  // If the value is a threadID return the range [0, blockDim.i - 1].
+  if (auto idOp = processorValue.getDefiningOp<gpu::ThreadIdOp>()) {
+    unsigned index = gpuDimToIndex(idOp.getDimension());
+    AffineExpr zero = builder.getAffineConstantExpr(0);
+    AffineExpr ubExpr = builder.getAffineConstantExpr(workGroup[index]);
+    return std::make_pair(zero, ubExpr - 1);
+  }
+  // If the value is a blockDim return the range [workGroup[i], workGroup[i]].
+  if (auto dimOp = processorValue.getDefiningOp<gpu::BlockDimOp>()) {
+    unsigned index = gpuDimToIndex(dimOp.getDimension());
+    AffineExpr bound = builder.getAffineConstantExpr(workGroup[index]);
+    return std::make_pair(bound, bound);
+  }
+  // If the value is a blockID return the range [0, blockGroup[i] - 1].
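+  // For instance (illustrative, assuming a 32x16x1 grid): gpu.block_id x is
+  // bounded by the pair (0, 31) and gpu.block_id y by (0, 15), which lets the
+  // pattern above fold away loops whose trip count depends only on these ids.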
+  if (auto idOp = processorValue.getDefiningOp<gpu::BlockIdOp>()) {
+    unsigned index = gpuDimToIndex(idOp.getDimension());
+    AffineExpr zero = builder.getAffineConstantExpr(0);
+    AffineExpr ubExpr = builder.getAffineConstantExpr(blockGroup[index]);
+    return std::make_pair(zero, ubExpr - 1);
+  }
+
+  return std::nullopt;
+}
+
+LogicalResult removeReduntantLoops(func::FuncOp funcOp,
+                                   SmallVector<int64_t, 3> workGroup,
+                                   SmallVector<int64_t> blockGroup) {
+  auto getParallelRangeFn = [=](Value processorValue,
+                                SmallVectorImpl<Value> &dims,
+                                SmallVectorImpl<Value> &symbols) {
+    return getWorkgroupRange(processorValue, dims, symbols, workGroup,
+                             blockGroup);
+  };
+  RewritePatternSet patterns(funcOp->getContext());
+  patterns.add<RemoveSingleIterationLoops>(patterns.getContext(),
+                                           getParallelRangeFn);
+  return applyPatternsAndFoldGreedily(funcOp, std::move(patterns));
+}
+
+struct RemoveReduntantLoopsPass
+    : public RemoveReduntantLoopsBase<RemoveReduntantLoopsPass> {
+public:
+  RemoveReduntantLoopsPass() = default;
+
+  void runOnOperation() override {
+    func::FuncOp funcOp = getOperation();
+
+    if (!funcHasGemm(funcOp)) {
+      return;
+    }
+
+    std::optional<SmallVector<int64_t, 3>> optionalWorkGroup =
+        getGemmBlockSize(funcOp);
+    if (!optionalWorkGroup.has_value()) {
+      return;
+    }
+    SmallVector<int64_t, 3> workGroup = optionalWorkGroup.value();
+
+    std::optional<scf::ForallOp> optionalBlockForallOp =
+        getForallOpMappedToBlock(funcOp);
+    if (!optionalBlockForallOp.has_value()) {
+      return;
+    }
+    scf::ForallOp forallOp = optionalBlockForallOp.value();
+    auto optionalBlockGroups =
+        getConstantIntValues(forallOp.getMixedUpperBound());
+    SmallVector<int64_t> blockGroup = optionalBlockGroups.value();
+    blockGroup.push_back(1);
+    if (failed(removeReduntantLoops(funcOp, workGroup, blockGroup))) {
+      return signalPassFailure();
+    }
+  }
+};
+
+} // namespace
+
+std::unique_ptr<OperationPass<func::FuncOp>>
+mlir::createRemoveReduntantLoops() {
+  return std::make_unique<RemoveReduntantLoopsPass>();
+}
\ No newline at end of file
diff --git a/midend/lib/Dialect/Linalg/Transforms/LinalgPromotion.cpp b/midend/lib/Dialect/Linalg/Transforms/LinalgPromotion.cpp
index 31f4225e43..811ea12713 100644
--- a/midend/lib/Dialect/Linalg/Transforms/LinalgPromotion.cpp
+++ b/midend/lib/Dialect/Linalg/Transforms/LinalgPromotion.cpp
@@ -193,11 +193,14 @@ struct LinalgPromotionPass : public LinalgPromotionBase<LinalgPromotionPass> {
     }
 
     promotionImpl(b, linalgContractOp);
+    b.setInsertionPoint(linalgContractOp);
+    b.create<gpu::BarrierOp>(linalgContractOp->getLoc());
+    b.setInsertionPointAfter(linalgContractOp);
+    b.create<gpu::BarrierOp>(linalgContractOp->getLoc());
-
   }
 };
 
-} // namespace mlir
+} // namespace
 
 std::unique_ptr<OperationPass<func::FuncOp>> mlir::createLinalgPromotionPass() {
diff --git a/midend/lib/Pipelines/CMakeLists.txt b/midend/lib/Pipelines/CMakeLists.txt
index 59cd289a1c..a6c36eceb0 100644
--- a/midend/lib/Pipelines/CMakeLists.txt
+++ b/midend/lib/Pipelines/CMakeLists.txt
@@ -9,6 +9,7 @@ add_mlir_library(BuddyPipelines
   MLIRIR
   BuddyGPUPipelines
   BuddyTransformPasses
+  BuddyGPUPasses
   BuddyGemmCodegenUtils
   BuddyPipelineUtils
 )
\ No newline at end of file
diff --git a/midend/lib/Pipelines/LinalgMemrefOpt.cpp b/midend/lib/Pipelines/LinalgMemrefOpt.cpp
index a7d5538162..9d9c3c51da 100644
--- a/midend/lib/Pipelines/LinalgMemrefOpt.cpp
+++ b/midend/lib/Pipelines/LinalgMemrefOpt.cpp
@@ -1,4 +1,6 @@
 #include "Pipelines/LinalgMemrefOpt.h"
+#include "GPU/Transforms/GPUDistributeToWarp.h"
+#include "GPU/Transforms/RemoveReduntantLoops.h"
 #include "Linalg/Transforms/LinalgPromotion.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Pass/PassManager.h"
@@ -17,6 +19,8 @@ void addGemmLinalgMemrefOptPipeline(OpPassManager &pm) {
   pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
   pm.addNestedPass<func::FuncOp>(createCSEPass());
   pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
+  pm.addNestedPass<func::FuncOp>(createGPUDistributeToWarpPass());
+  pm.addNestedPass<func::FuncOp>(createRemoveReduntantLoops());
 }
 
 void createLinalgMemrefOptPipelineImpl(OpPassManager &pm,