From 0f8c087bd7b5816e9e570197010148608ea3f186 Mon Sep 17 00:00:00 2001 From: CJ77Qi Date: Mon, 19 Aug 2024 05:09:20 +0000 Subject: [PATCH 1/7] Buddy GPU GEMM linalg tensor opt - tiling --- midend/include/Dialect/CMakeLists.txt | 1 + .../include/Dialect/Transform/CMakeLists.txt | 3 + midend/include/Dialect/Transform/Passes.h | 17 + midend/include/Dialect/Transform/Passes.td | 10 + .../Transforms/TransformDialectInterpreter.h | 10 + .../Transform/Transforms/TransformInsertion.h | 19 ++ .../MatMulOptimization/CMakeLists.txt | 2 + .../MatMulOptimization/GPU/CMakeLists.txt | 11 + .../GPU/MatmulTilingOpt.cpp | 297 ++++++++++++++++++ midend/lib/Dialect/CMakeLists.txt | 1 + midend/lib/Dialect/Transform/CMakeLists.txt | 1 + .../Transform/Transforms/CMakeLists.txt | 16 + .../TransformDialectInterpreter.cpp | 45 +++ .../Transforms/TransformInsertion.cpp | 88 ++++++ 14 files changed, 521 insertions(+) create mode 100644 midend/include/Dialect/Transform/CMakeLists.txt create mode 100644 midend/include/Dialect/Transform/Passes.h create mode 100644 midend/include/Dialect/Transform/Passes.td create mode 100644 midend/include/Dialect/Transform/Transforms/TransformDialectInterpreter.h create mode 100644 midend/include/Dialect/Transform/Transforms/TransformInsertion.h create mode 100644 midend/lib/Conversion/MatMulOptimization/GPU/CMakeLists.txt create mode 100644 midend/lib/Conversion/MatMulOptimization/GPU/MatmulTilingOpt.cpp create mode 100644 midend/lib/Dialect/Transform/CMakeLists.txt create mode 100644 midend/lib/Dialect/Transform/Transforms/CMakeLists.txt create mode 100644 midend/lib/Dialect/Transform/Transforms/TransformDialectInterpreter.cpp create mode 100644 midend/lib/Dialect/Transform/Transforms/TransformInsertion.cpp diff --git a/midend/include/Dialect/CMakeLists.txt b/midend/include/Dialect/CMakeLists.txt index 8ab8f29f58..7b472d0d2d 100644 --- a/midend/include/Dialect/CMakeLists.txt +++ b/midend/include/Dialect/CMakeLists.txt @@ -5,3 +5,4 @@ add_subdirectory(RVV) add_subdirectory(VectorExp) add_subdirectory(Gemmini) add_subdirectory(Sche) +add_subdirectory(Transform) \ No newline at end of file diff --git a/midend/include/Dialect/Transform/CMakeLists.txt b/midend/include/Dialect/Transform/CMakeLists.txt new file mode 100644 index 0000000000..a63c82dbdf --- /dev/null +++ b/midend/include/Dialect/Transform/CMakeLists.txt @@ -0,0 +1,3 @@ +set(LLVM_TARGET_DEFINITIONS Passes.td) +mlir_tablegen(Passes.h.inc -gen-pass-decls) +add_public_tablegen_target(BuddyTransformPassIncGen) \ No newline at end of file diff --git a/midend/include/Dialect/Transform/Passes.h b/midend/include/Dialect/Transform/Passes.h new file mode 100644 index 0000000000..1c11fc4010 --- /dev/null +++ b/midend/include/Dialect/Transform/Passes.h @@ -0,0 +1,17 @@ +#pragma once +#include "mlir/Pass/Pass.h" +#include "mlir/Pass/PassRegistry.h" +namespace mlir { +class ModuleOp; +// Generate the definition of Transform Passes +#define GEN_PASS_DECL +#include "Transform/Passes.h.inc" + +// Include the constructor of passes in Transform Dialect +#include "Transform/Transforms/TransformDialectInterpreter.h" +#include "Transform/Transforms/TransformInsertion.h" + +#define GEN_PASS_REGISTRATION +#include "Transform/Passes.h.inc" + +} \ No newline at end of file diff --git a/midend/include/Dialect/Transform/Passes.td b/midend/include/Dialect/Transform/Passes.td new file mode 100644 index 0000000000..f3790312ce --- /dev/null +++ b/midend/include/Dialect/Transform/Passes.td @@ -0,0 +1,10 @@ +include "mlir/Pass/PassBase.td" + +// 
TransformDialectInterpreter +def TransformDialectInterpreter : Pass<"transform-dialect-interpreter", "ModuleOp"> { + let summary = "Apply transform dialect operations one by one"; + let constructor = "mlir::createTransformDialectInterpreter()"; + let options = [ + Option<"eraseAfter", "erase-after", "bool", "false", "erase Transform Ops after applied"> + ]; +} diff --git a/midend/include/Dialect/Transform/Transforms/TransformDialectInterpreter.h b/midend/include/Dialect/Transform/Transforms/TransformDialectInterpreter.h new file mode 100644 index 0000000000..d8038c59ec --- /dev/null +++ b/midend/include/Dialect/Transform/Transforms/TransformDialectInterpreter.h @@ -0,0 +1,10 @@ +#include "mlir/Pass/Pass.h" +#include + +namespace mlir { +class ModuleOp; + +std::unique_ptr> +createTransformDialectInterpreter(bool eraseAfter = false); + +} // namespace mlir \ No newline at end of file diff --git a/midend/include/Dialect/Transform/Transforms/TransformInsertion.h b/midend/include/Dialect/Transform/Transforms/TransformInsertion.h new file mode 100644 index 0000000000..aca6a50177 --- /dev/null +++ b/midend/include/Dialect/Transform/Transforms/TransformInsertion.h @@ -0,0 +1,19 @@ +#include "mlir/Pass/Pass.h" +#include +#include + +namespace mlir { +class ModuleOp; +class ImplicitLocOpBuilder; + +struct TransformInsertionConfig { + std::string funcAnchor; + std::string matchPrefix; + std::function opFilter; + std::function transformBuilder; +} + +std::unique_ptr> +createGenericTransformInsertionPass(const TransformInsertionConfig &config); + +} \ No newline at end of file diff --git a/midend/lib/Conversion/MatMulOptimization/CMakeLists.txt b/midend/lib/Conversion/MatMulOptimization/CMakeLists.txt index 8e726863eb..69405c053e 100644 --- a/midend/lib/Conversion/MatMulOptimization/CMakeLists.txt +++ b/midend/lib/Conversion/MatMulOptimization/CMakeLists.txt @@ -14,3 +14,5 @@ add_mlir_library(BatchMatMulOptimization add_mlir_library(MatMulParallelVectorization MatMulParallelVectorization.cpp ) + +add_subdirectory(GPU) \ No newline at end of file diff --git a/midend/lib/Conversion/MatMulOptimization/GPU/CMakeLists.txt b/midend/lib/Conversion/MatMulOptimization/GPU/CMakeLists.txt new file mode 100644 index 0000000000..03150ba4b0 --- /dev/null +++ b/midend/lib/Conversion/MatMulOptimization/GPU/CMakeLists.txt @@ -0,0 +1,11 @@ +add_mlir_library(BuddyIRGPUMatmulOpt + DEPENDS + MhloDialect + MLIRBufferTransforms + + LINK_LIBS PUBLIC + MLIRIR + MLIRTransforms + MLIRLinalgExtTransformOps + BuddyUtils +) \ No newline at end of file diff --git a/midend/lib/Conversion/MatMulOptimization/GPU/MatmulTilingOpt.cpp b/midend/lib/Conversion/MatMulOptimization/GPU/MatmulTilingOpt.cpp new file mode 100644 index 0000000000..11daea6998 --- /dev/null +++ b/midend/lib/Conversion/MatMulOptimization/GPU/MatmulTilingOpt.cpp @@ -0,0 +1,297 @@ +#include "mlir/Pass/PassManager.h" +#include "mlir/Pass/PassOptions.h" +#include "mlir/Pass/PassRegistry.h" + +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h" +#include "mlir/Dialect/MemRef/TransformOps/MemRefTransformOps.h" +#include "mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/IR/BuiltinOps.h" +#include "llvm/ADT/SmallSet.h" + +#include + +namespace mlir { + +struct GPUGemmCodegenConfigOptions + : public PassPipelineOptions { + Option funcAnchor{ + *this, "func-anchor", + 
llvm::cl::desc( + "An optional Unit attribute anchoring on target functions."), + llvm::cl::init("")}; + Option annotatePrefix{ + *this, "annotate-prefix", + llvm::cl::desc("An optional annotate prefix attribute on target ops."), + llvm::cl::init("__buddyir_gpu_tile_gemm")}; + ListOption tileSizeConfig{ + *this, "tile-size-config", + llvm::cl::desc("An optional tile size config for tile matmul op.")}; + ListOption workgroupSize{ + *this, "workgroup-size", + llvm::cl::desc("An optional workgroup size config for tile matmul op.")}; + Option stages{ + *this, "stages", llvm::cl::desc("An optional stages for tile matmul op."), + llvm::cl::init(3)}; +}; + +struct GPUGemmGeneralOptions + : public PassPipelineOptions { + Option funcAnchor{ + *this, "func-anchor", + llvm::cl::desc( + "An optional Unit attribute anchoring on target functions."), + llvm::cl::init("")}; + Option annotatePrefix{ + *this, "annotate-prefix", + llvm::cl::desc("An optional annotate prefix attribute on target ops."), + llvm::cl::init("__buddyir_gpu_tile_gemm")}; +}; + +using namespace mlir; + +namespace { + +constexpr StringRef getLinalgToGPUAttrName() { return "__buddyir_to_gpu__"; } + +constexpr StringRef getLinalgTargetAttrName() { return "__buddyir_target__"; } + +template +void invokeOpPassPipelineBuilder(Builder builder, OpPassManager &pm, + Args &&...args) { + if (pm.getOpAnchorName() != OpPassManager::getAnyOpAnchorName() && + pm.getOpAnchorName() != OpClass::getOperationName()) { + if (pm.getNesting() == OpPassManager::Nesting::Implicit) { + builder(pm.nest(), std::forward(args)...); + return; + } + llvm::report_fatal_error( + llvm::Twine("Can't build pass pipeline on expected op type ") + + OpClass::getOperationName() + " but got " + pm.getOpAnchorName()); + } else { + builder(pm, std::forward(args)...); + } +} + +void createGPUTileGemmTransformImpl(OpPassManager &pm, + const std::string &anchor, + const std::string &prefix) { + TransformInsertionConfig config; + config.funcAnchor = anchor; + config.matchPrefix = prefix; + config.opFilter = [=](Operation *op) { + if (!isLinalgOpMatmul(op)) + return false; + return true; + }; + + config.transformBuilder = [=](ImplicitLocOpBuilder &b, Operation *op, + Value pdlV) { + func::FuncOp funcOp = op->getParentOfType(); + linalg::LinalgOp linalgOp = cast(op); + Operation *user = *linalgOp->getUsers().begin(); + bool hasEpilogue = isa(user); + + if (hasEpilogue) { + setMarker(user, getEpilogueMarker()); + } + + bool isBMM = linalgOp.getNumParallelLoops() == 3; + + SmallVector tileSizeConfig = getGemmTileSize(funcOp).value(); + + auto func = b.create( + pdlV.getType(), pdlV, + /* isolated_from_above */ false, + /* allow_empty_results */ false, + /* op_name */ b.getStringAttr(func::FuncOp::getOperationName()), + /* deduplicate */ false, + /* nth_parent */ 1); + + auto anyType = transform::AnyOpType::get(b.getContext()); + auto linalgFillType = transform::OperationType::get( + b.getContext(), linalg::FillOp::getOperationName()); + auto linalgFill = b.create( + linalgFillType, func, linalg::FillOp::getOperationName()); + + Value mmaLevel = b.create( + /* type */ pdl::AttributeType::get(b.getContext()), + /* value */ b.getStringAttr("Threadblock")); + Value target = b.create( + /* type */ pdl::AttributeType::get(b.getContext()), + /* value */ b.getStringAttr("nv_sm_80")); + + SmallVector mappingIdx; + if (isBMM) { + mappingIdx = {2, 1, 0}; + } else { + mappingIdx = {1, 0}; + } + auto mapping = llvm::to_vector(llvm::map_range( + mappingIdx, [](int64_t i) { return static_cast(i); 
})); + auto mappingAttrs = llvm::to_vector( + llvm::map_range(mapping, [&](gpu::MappingId dim) -> Attribute { + return gpu::GPUBlockMappingAttr::get(b.getContext(), dim); + })); + + SmallVector parrallelTileSizes; + if (isBMM) { + parrallelTileSizes = {1, tileSizeConfig[0], tileSizeConfig[1]}; + } else { + parrallelTileSizes = {tileSizeConfig[0], tileSizeConfig[1]}; + } + Value tiledMatmulOp; + if (hasEpilogue) { + auto linalgGenericType = transform::OperationType::get( + b.getContext(), linalg::GenericOp::getOperationName()); + auto epilogue = b.create( + linalgGenericType, func, + b.getStrArrayAttr({linalg::GenericOp::getOperationName()}), + /*matchInterfaceEnum=*/transform::MatchInterfaceEnumAttr(), + /*opAttrs=*/ + b.getDictionaryAttr({NamedAttribute( + b.getStringAttr(getEpilogueMarker()), b.getUnitAttr())}), + /*filterResultType=*/TypeAttr(), + /*filterOperandTYpes=*/ArrayAttr()); + + transform::TileUsingForallOp tileOp = + b.create( + /* target */ epilogue, + /* staticTileSizes */ parrallelTileSizes, + /* ctor tag */ transform::TileSizesSpec(), + /* mapping */ b.getArrayAttr(mappingAttrs)); + transform::FuseIntoContainingOp fuse = + b.create( + /* producerOp */ pdlV, + /* containingOp */ tileOp.getForallOp()); + b.create( + /* producerOp */ linalgFill, + /* containingOp */ fuse.getNewContainingOp()); + tiledMatmulOp = fuse.getFusedOp(); + } else { + transform::TileUsingForallOp tileOp = + b.create( + /* target */ pdlV, + /* staticTileSizes */ parrallelTileSizes, + /* ctor tag */ transform::TileSizesSpec(), + /* mapping */ b.getArrayAttr(mappingAttrs)); + + b.create( + /* producerOp */ linalgFill, + /* containingOp */ tileOp.getForallOp()); + tiledMatmulOp = tileOp.getTiledOp(); + } + + SmallVector reductionTileSizes; + if (isBMM) + reductionTileSizes = {0, 0, 0, tileSizeConfig[2]}; + else + reductionTileSizes = {0, 0, tileSizeConfig[2]}; + auto tileKMatmulOp = + b.create(tiledMatmulOp, reductionTileSizes); + auto matmulKOp = tileKMatmulOp.getTiledLinalgOp(); + auto forLoops = tileKMatmulOp.getLoops(); + if (!forLoops.empty()) { + b.create(forLoops[0], getMatmulMainLoopMarker(), + Value()); + } else { + b.create(matmulKOp, getMatmulMainLoopMarker(), + Value()); + } + + b.create(matmulKOp, getLinalgMMALevelAttrName(), + mmaLevel); + b.create(matmulKOp, getLinalgTargetAttrName(), + target); + b.create(matmulKOp, getMMAPatternAttrName(), + Value()); + }; + + pm.addPass(createGenericTransformInsertionPass(config)); +} + +} // namespace + +void mlir::createGPUTileGemmTransform(OpPassManager &pm, + const GPUGemmGeneralOptions &options) { + invokeOpPassPipelineBuilder(createGPUTileGemmTransformImpl, pm, + options.funcAnchor, options.annotatePrefix); +} + +namespace { + +void createGPUAddGemmCodegenLoweringConfigTransformImpl( + OpPassManager &pm, const std::string &anchor, const std::string &prefix, + ArrayRef tileSizeConfig, ArrayRef workgroupSize, + int64_t stages) { + + SmallVector tileSizeConfigVec{tileSizeConfig}; + SmallVector workgroupSizeVec{workgroupSize}; + + TransformInsertionConfig config; + config.funcAnchor = anchor; + config.matchPrefix = prefix; + + config.opFilter = [=](Operation *op) { + if (isLinalgOpMatmul(op)) { + // TODO: check if the matmul op is already annotated + // TODO: Add different lowering config for different matmul op size + return true; + } + return false; + }; + + config.transformBuilder = [=](ImplicitLocOpBuilder &b, Operation *op, + Value pdlV) { + // auto linalgOp = llvm::cast(op); + auto tileSizeConfigAttrs = b.getAttr(llvm::to_vector( + 
llvm::map_range(tileSizeConfigVec, [&](int64_t i) -> Attribute { + return b.getI64IntegerAttr(i); + }))); + auto workgroupSizeAttrs = b.getAttr(llvm::to_vector( + llvm::map_range(workgroupSizeVec, [&](int64_t i) -> Attribute { + return b.getI64IntegerAttr(i); + }))); + auto stagesAttr = b.getI64IntegerAttr(stages); + + auto func = b.create( + pdlV.getType(), pdlV, + /* isolated_from_above */ true, + /* allow_empty_results */ false, + /* op_name */ b.getStringAttr(func::FuncOp::getOperationName()), + /* deduplicate */ false, + /* nth_parent */ 1); + + Value tileSizeConfigValue = b.create( + /* type */ pdl::AttributeType::get(b.getContext()), + /* value */ tileSizeConfigAttrs); + Value workgroupSizeValue = b.create( + /* type */ pdl::AttributeType::get(b.getContext()), + /* value */ workgroupSizeAttrs); + Value stagesValue = b.create( + /* type */ pdl::AttributeType::get(b.getContext()), + /* value */ stagesAttr); + + b.create(func, getGemmTileConfigAttrName(), + tileSizeConfigValue); + b.create(func, getGemmBlockSizeAttrName(), + workgroupSizeValue); + b.create(func, getGemmPipelineDepthAttrName(), + stagesValue); + }; + pm.addPass(createGenericTransformInsertionPass(config)); +} +} // namespace + +void mlir::createGPUAddGemmCodegenLoweringConfigTransform( + OpPassManager &pm, const GPUGemmCodegenConfigOptions &options) { + invokeOpPassPipelineBuilder( + createGPUAddGemmCodegenLoweringConfigTransformImpl, pm, + options.funcAnchor, options.annotatePrefix, options.tileSizeConfig, + options.workgroupSize, options.stages); +} + +} \ No newline at end of file diff --git a/midend/lib/Dialect/CMakeLists.txt b/midend/lib/Dialect/CMakeLists.txt index 8ab8f29f58..304de8570c 100644 --- a/midend/lib/Dialect/CMakeLists.txt +++ b/midend/lib/Dialect/CMakeLists.txt @@ -5,3 +5,4 @@ add_subdirectory(RVV) add_subdirectory(VectorExp) add_subdirectory(Gemmini) add_subdirectory(Sche) +add_subdirectory(Transform) diff --git a/midend/lib/Dialect/Transform/CMakeLists.txt b/midend/lib/Dialect/Transform/CMakeLists.txt new file mode 100644 index 0000000000..5c919f7dfc --- /dev/null +++ b/midend/lib/Dialect/Transform/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(Transforms) \ No newline at end of file diff --git a/midend/lib/Dialect/Transform/Transforms/CMakeLists.txt b/midend/lib/Dialect/Transform/Transforms/CMakeLists.txt new file mode 100644 index 0000000000..ac0fe3b197 --- /dev/null +++ b/midend/lib/Dialect/Transform/Transforms/CMakeLists.txt @@ -0,0 +1,16 @@ +add_mlir_dialect_library(BuddyTransformPasses + TransformDialectInterpreter.cpp + TransformInsertion.cpp + + DEPENDS + BuddyTransformPassIncGen +# MLIRLinalgExtTransformOps +# MLIRTransformExtOpsIncGen + + LINK_LIBS PUBLIC + MLIRIR + MLIRPass + MLIRPDLDialect + MLIRTransformDialect +# MLIRLinalgExtTransformOps +) diff --git a/midend/lib/Dialect/Transform/Transforms/TransformDialectInterpreter.cpp b/midend/lib/Dialect/Transform/Transforms/TransformDialectInterpreter.cpp new file mode 100644 index 0000000000..24f81e5a89 --- /dev/null +++ b/midend/lib/Dialect/Transform/Transforms/TransformDialectInterpreter.cpp @@ -0,0 +1,45 @@ +#define GEN_PASS_DEF_TRANSFORMDIALECTINTERPRETER +#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h" +#include "Transform/Passes.h" +#include "mlir/IR/BuiltinOps.h" + +using namespace mlir; + +namespace { + +struct TransformDialectInterpreterPass + : public impl::TransformDialectInterpreterBase { + + explicit TransformDialectInterpreterPass(bool erase) + : TransformDialectInterpreterBase() { + eraseAfter = erase; + } + + 
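+  // Walks the module's top-level transform ops (anything implementing
+  // TransformOpInterface) and applies each one to the module payload with
+  // expensive checks disabled; when `eraseAfter` is set, the transform IR
+  // is erased from the module once it has been applied.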
+  void runOnOperation() override {
+    ModuleOp module = getOperation();
+    for (auto op : module.getOps<transform::TransformOpInterface>()) {
+      RaggedArray<transform::MappedValue> extraMappings;
+      if (failed(transform::applyTransforms(
+              module, op, extraMappings,
+              transform::TransformOptions().enableExpensiveChecks(false)))) {
+        return signalPassFailure();
+      }
+    }
+    if (eraseAfter) {
+      module.walk([&](Operation *nestedOp) {
+        if (isa<transform::TransformOpInterface>(nestedOp)) {
+          nestedOp->erase();
+          return WalkResult::skip();
+        }
+        return WalkResult::advance();
+      });
+    }
+  }
+};
+
+} // namespace
+
+std::unique_ptr<OperationPass<ModuleOp>>
+createTransformDialectInterpreter(bool eraseAfter = false) {
+  return std::make_unique<TransformDialectInterpreterPass>(eraseAfter);
+}
\ No newline at end of file
diff --git a/midend/lib/Dialect/Transform/Transforms/TransformInsertion.cpp b/midend/lib/Dialect/Transform/Transforms/TransformInsertion.cpp
new file mode 100644
index 0000000000..6595b8347c
--- /dev/null
+++ b/midend/lib/Dialect/Transform/Transforms/TransformInsertion.cpp
@@ -0,0 +1,88 @@
+#include "Transform/Passes.h"
+
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/Linalg/TransformOps/DialectExtension.h"
+#include "mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h"
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/Dialect/Transform/IR/TransformDialect.h"
+#include "mlir/Dialect/Transform/IR/TransformOps.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/Dialect.h"
+#include "mlir/IR/IRMapping.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/IR/Operation.h"
+
+#include
+#include
+
+using namespace mlir;
+using namespace llvm;
+
+namespace {
+
+void insertTransformIR(func::FuncOp funcOp, OpBuilder &builder,
+                       const TransformInsertionConfig &config) {
+  funcOp->walk([&](Operation *op) {
+    if (config.opFilter(op)) {
+      ImplicitLocOpBuilder b(op->getLoc(), builder);
+      MLIRContext *ctx = b.getContext();
+
+      auto annotation = getAnnotationUniqueIdentifier(config.matchPrefix);
+      op->setAttr(annotation, UnitAttr::get(ctx));
+
+      auto pdlOperationType = pdl::OperationType::get(ctx);
+      b.create<transform::SequenceOp>(
+          TypeRange(), transform::FailurePropagationMode::Propagate,
+          pdlOperationType, [&](OpBuilder &b, Location loc, Value blockArg) {
+            auto annotationAttr = DictionaryAttr::get(
+                ctx, b.getNamedAttr(annotation, UnitAttr::get(ctx)));
+            auto match = b.create<transform::MatchOp>(
+                loc, blockArg.getType(), blockArg, ArrayAttr(),
+                transform::MatchInterfaceEnumAttr(), annotationAttr, TypeAttr(),
+                ArrayAttr());
+            ImplicitLocOpBuilder ib(loc, b);
+            config.transformBuilder(ib, op, match);
+            b.create<transform::YieldOp>(loc);
+          });
+    }
+  });
+}
+
+void insertTransformIR(ModuleOp module, const TransformInsertionConfig &config) {
+  OpBuilder builder = OpBuilder::atBlockEnd(module.getBody());
+  for (auto funcOp : module.getOps<func::FuncOp>()) {
+    if (!config.funcAnchor.empty() && !funcOp->hasAttr(config.funcAnchor)) {
+      continue;
+    }
+    insertTransformIR(funcOp, builder, config);
+  }
+}
+
+struct GenericTransformInsertionPass
+    : public PassWrapper<GenericTransformInsertionPass,
+                         OperationPass<ModuleOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(GenericTransformInsertionPass)
+
+  GenericTransformInsertionPass(const TransformInsertionConfig &config) : config(config) {}
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<transform::TransformDialect>();
+  }
+
+  void runOnOperation() override {
+    insertTransformIR(getOperation(), config);
+  }
+
+protected:
+  TransformInsertionConfig config;
+};
+
+} // namespace
+
+std::unique_ptr<OperationPass<ModuleOp>>
+mlir::createGenericTransformInsertionPass(const TransformInsertionConfig &config) { + return std::make_unique(config); +} \ No newline at end of file From 7a18985508eb4815582f3241e6e95424eae70ff9 Mon Sep 17 00:00:00 2001 From: CJ77Qi Date: Wed, 21 Aug 2024 09:49:54 +0000 Subject: [PATCH 2/7] fix bug --- midend/include/Dialect/Transform/Passes.h | 10 +- midend/include/Dialect/Transform/Passes.td | 2 +- .../Transforms/TransformDialectInterpreter.h | 1 + .../Transform/Transforms/TransformInsertion.h | 1 + .../MatMulOptimization/CMakeLists.txt | 4 +- .../MatMulOptimization/GPU/CMakeLists.txt | 11 - .../GPU/MatmulTilingOpt.cpp | 297 ------------------ .../Dialect/Transform/Transforms/PassDetail.h | 13 + .../TransformDialectInterpreter.cpp | 14 +- .../Transforms/TransformInsertion.cpp | 16 +- 10 files changed, 42 insertions(+), 327 deletions(-) delete mode 100644 midend/lib/Conversion/MatMulOptimization/GPU/CMakeLists.txt delete mode 100644 midend/lib/Conversion/MatMulOptimization/GPU/MatmulTilingOpt.cpp create mode 100644 midend/lib/Dialect/Transform/Transforms/PassDetail.h diff --git a/midend/include/Dialect/Transform/Passes.h b/midend/include/Dialect/Transform/Passes.h index 1c11fc4010..ae7bc8b394 100644 --- a/midend/include/Dialect/Transform/Passes.h +++ b/midend/include/Dialect/Transform/Passes.h @@ -1,16 +1,16 @@ -#pragma once #include "mlir/Pass/Pass.h" #include "mlir/Pass/PassRegistry.h" + +// Include the constructor of passes in Transform Dialect +#include "Transform/Transforms/TransformDialectInterpreter.h" +#include "Transform/Transforms/TransformInsertion.h" + namespace mlir { class ModuleOp; // Generate the definition of Transform Passes #define GEN_PASS_DECL #include "Transform/Passes.h.inc" -// Include the constructor of passes in Transform Dialect -#include "Transform/Transforms/TransformDialectInterpreter.h" -#include "Transform/Transforms/TransformInsertion.h" - #define GEN_PASS_REGISTRATION #include "Transform/Passes.h.inc" diff --git a/midend/include/Dialect/Transform/Passes.td b/midend/include/Dialect/Transform/Passes.td index f3790312ce..104523d2e8 100644 --- a/midend/include/Dialect/Transform/Passes.td +++ b/midend/include/Dialect/Transform/Passes.td @@ -1,7 +1,7 @@ include "mlir/Pass/PassBase.td" // TransformDialectInterpreter -def TransformDialectInterpreter : Pass<"transform-dialect-interpreter", "ModuleOp"> { +def TransformDialectInterpreter : Pass<"transform-dialect-interpreter", "mlir::ModuleOp"> { let summary = "Apply transform dialect operations one by one"; let constructor = "mlir::createTransformDialectInterpreter()"; let options = [ diff --git a/midend/include/Dialect/Transform/Transforms/TransformDialectInterpreter.h b/midend/include/Dialect/Transform/Transforms/TransformDialectInterpreter.h index d8038c59ec..fcd54ba8c1 100644 --- a/midend/include/Dialect/Transform/Transforms/TransformDialectInterpreter.h +++ b/midend/include/Dialect/Transform/Transforms/TransformDialectInterpreter.h @@ -2,6 +2,7 @@ #include namespace mlir { + class ModuleOp; std::unique_ptr> diff --git a/midend/include/Dialect/Transform/Transforms/TransformInsertion.h b/midend/include/Dialect/Transform/Transforms/TransformInsertion.h index aca6a50177..22ad91c09e 100644 --- a/midend/include/Dialect/Transform/Transforms/TransformInsertion.h +++ b/midend/include/Dialect/Transform/Transforms/TransformInsertion.h @@ -3,6 +3,7 @@ #include namespace mlir { + class ModuleOp; class ImplicitLocOpBuilder; diff --git a/midend/lib/Conversion/MatMulOptimization/CMakeLists.txt 
b/midend/lib/Conversion/MatMulOptimization/CMakeLists.txt index 69405c053e..c8695e600d 100644 --- a/midend/lib/Conversion/MatMulOptimization/CMakeLists.txt +++ b/midend/lib/Conversion/MatMulOptimization/CMakeLists.txt @@ -13,6 +13,4 @@ add_mlir_library(BatchMatMulOptimization add_mlir_library(MatMulParallelVectorization MatMulParallelVectorization.cpp -) - -add_subdirectory(GPU) \ No newline at end of file +) \ No newline at end of file diff --git a/midend/lib/Conversion/MatMulOptimization/GPU/CMakeLists.txt b/midend/lib/Conversion/MatMulOptimization/GPU/CMakeLists.txt deleted file mode 100644 index 03150ba4b0..0000000000 --- a/midend/lib/Conversion/MatMulOptimization/GPU/CMakeLists.txt +++ /dev/null @@ -1,11 +0,0 @@ -add_mlir_library(BuddyIRGPUMatmulOpt - DEPENDS - MhloDialect - MLIRBufferTransforms - - LINK_LIBS PUBLIC - MLIRIR - MLIRTransforms - MLIRLinalgExtTransformOps - BuddyUtils -) \ No newline at end of file diff --git a/midend/lib/Conversion/MatMulOptimization/GPU/MatmulTilingOpt.cpp b/midend/lib/Conversion/MatMulOptimization/GPU/MatmulTilingOpt.cpp deleted file mode 100644 index 11daea6998..0000000000 --- a/midend/lib/Conversion/MatMulOptimization/GPU/MatmulTilingOpt.cpp +++ /dev/null @@ -1,297 +0,0 @@ -#include "mlir/Pass/PassManager.h" -#include "mlir/Pass/PassOptions.h" -#include "mlir/Pass/PassRegistry.h" - -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/GPU/IR/GPUDialect.h" -#include "mlir/Dialect/Linalg/IR/Linalg.h" -#include "mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h" -#include "mlir/Dialect/MemRef/TransformOps/MemRefTransformOps.h" -#include "mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.h" -#include "mlir/Dialect/Tensor/IR/Tensor.h" -#include "mlir/IR/BuiltinOps.h" -#include "llvm/ADT/SmallSet.h" - -#include - -namespace mlir { - -struct GPUGemmCodegenConfigOptions - : public PassPipelineOptions { - Option funcAnchor{ - *this, "func-anchor", - llvm::cl::desc( - "An optional Unit attribute anchoring on target functions."), - llvm::cl::init("")}; - Option annotatePrefix{ - *this, "annotate-prefix", - llvm::cl::desc("An optional annotate prefix attribute on target ops."), - llvm::cl::init("__buddyir_gpu_tile_gemm")}; - ListOption tileSizeConfig{ - *this, "tile-size-config", - llvm::cl::desc("An optional tile size config for tile matmul op.")}; - ListOption workgroupSize{ - *this, "workgroup-size", - llvm::cl::desc("An optional workgroup size config for tile matmul op.")}; - Option stages{ - *this, "stages", llvm::cl::desc("An optional stages for tile matmul op."), - llvm::cl::init(3)}; -}; - -struct GPUGemmGeneralOptions - : public PassPipelineOptions { - Option funcAnchor{ - *this, "func-anchor", - llvm::cl::desc( - "An optional Unit attribute anchoring on target functions."), - llvm::cl::init("")}; - Option annotatePrefix{ - *this, "annotate-prefix", - llvm::cl::desc("An optional annotate prefix attribute on target ops."), - llvm::cl::init("__buddyir_gpu_tile_gemm")}; -}; - -using namespace mlir; - -namespace { - -constexpr StringRef getLinalgToGPUAttrName() { return "__buddyir_to_gpu__"; } - -constexpr StringRef getLinalgTargetAttrName() { return "__buddyir_target__"; } - -template -void invokeOpPassPipelineBuilder(Builder builder, OpPassManager &pm, - Args &&...args) { - if (pm.getOpAnchorName() != OpPassManager::getAnyOpAnchorName() && - pm.getOpAnchorName() != OpClass::getOperationName()) { - if (pm.getNesting() == OpPassManager::Nesting::Implicit) { - builder(pm.nest(), std::forward(args)...); - return; - } - 
llvm::report_fatal_error( - llvm::Twine("Can't build pass pipeline on expected op type ") + - OpClass::getOperationName() + " but got " + pm.getOpAnchorName()); - } else { - builder(pm, std::forward(args)...); - } -} - -void createGPUTileGemmTransformImpl(OpPassManager &pm, - const std::string &anchor, - const std::string &prefix) { - TransformInsertionConfig config; - config.funcAnchor = anchor; - config.matchPrefix = prefix; - config.opFilter = [=](Operation *op) { - if (!isLinalgOpMatmul(op)) - return false; - return true; - }; - - config.transformBuilder = [=](ImplicitLocOpBuilder &b, Operation *op, - Value pdlV) { - func::FuncOp funcOp = op->getParentOfType(); - linalg::LinalgOp linalgOp = cast(op); - Operation *user = *linalgOp->getUsers().begin(); - bool hasEpilogue = isa(user); - - if (hasEpilogue) { - setMarker(user, getEpilogueMarker()); - } - - bool isBMM = linalgOp.getNumParallelLoops() == 3; - - SmallVector tileSizeConfig = getGemmTileSize(funcOp).value(); - - auto func = b.create( - pdlV.getType(), pdlV, - /* isolated_from_above */ false, - /* allow_empty_results */ false, - /* op_name */ b.getStringAttr(func::FuncOp::getOperationName()), - /* deduplicate */ false, - /* nth_parent */ 1); - - auto anyType = transform::AnyOpType::get(b.getContext()); - auto linalgFillType = transform::OperationType::get( - b.getContext(), linalg::FillOp::getOperationName()); - auto linalgFill = b.create( - linalgFillType, func, linalg::FillOp::getOperationName()); - - Value mmaLevel = b.create( - /* type */ pdl::AttributeType::get(b.getContext()), - /* value */ b.getStringAttr("Threadblock")); - Value target = b.create( - /* type */ pdl::AttributeType::get(b.getContext()), - /* value */ b.getStringAttr("nv_sm_80")); - - SmallVector mappingIdx; - if (isBMM) { - mappingIdx = {2, 1, 0}; - } else { - mappingIdx = {1, 0}; - } - auto mapping = llvm::to_vector(llvm::map_range( - mappingIdx, [](int64_t i) { return static_cast(i); })); - auto mappingAttrs = llvm::to_vector( - llvm::map_range(mapping, [&](gpu::MappingId dim) -> Attribute { - return gpu::GPUBlockMappingAttr::get(b.getContext(), dim); - })); - - SmallVector parrallelTileSizes; - if (isBMM) { - parrallelTileSizes = {1, tileSizeConfig[0], tileSizeConfig[1]}; - } else { - parrallelTileSizes = {tileSizeConfig[0], tileSizeConfig[1]}; - } - Value tiledMatmulOp; - if (hasEpilogue) { - auto linalgGenericType = transform::OperationType::get( - b.getContext(), linalg::GenericOp::getOperationName()); - auto epilogue = b.create( - linalgGenericType, func, - b.getStrArrayAttr({linalg::GenericOp::getOperationName()}), - /*matchInterfaceEnum=*/transform::MatchInterfaceEnumAttr(), - /*opAttrs=*/ - b.getDictionaryAttr({NamedAttribute( - b.getStringAttr(getEpilogueMarker()), b.getUnitAttr())}), - /*filterResultType=*/TypeAttr(), - /*filterOperandTYpes=*/ArrayAttr()); - - transform::TileUsingForallOp tileOp = - b.create( - /* target */ epilogue, - /* staticTileSizes */ parrallelTileSizes, - /* ctor tag */ transform::TileSizesSpec(), - /* mapping */ b.getArrayAttr(mappingAttrs)); - transform::FuseIntoContainingOp fuse = - b.create( - /* producerOp */ pdlV, - /* containingOp */ tileOp.getForallOp()); - b.create( - /* producerOp */ linalgFill, - /* containingOp */ fuse.getNewContainingOp()); - tiledMatmulOp = fuse.getFusedOp(); - } else { - transform::TileUsingForallOp tileOp = - b.create( - /* target */ pdlV, - /* staticTileSizes */ parrallelTileSizes, - /* ctor tag */ transform::TileSizesSpec(), - /* mapping */ b.getArrayAttr(mappingAttrs)); - - b.create( - 
/* producerOp */ linalgFill, - /* containingOp */ tileOp.getForallOp()); - tiledMatmulOp = tileOp.getTiledOp(); - } - - SmallVector reductionTileSizes; - if (isBMM) - reductionTileSizes = {0, 0, 0, tileSizeConfig[2]}; - else - reductionTileSizes = {0, 0, tileSizeConfig[2]}; - auto tileKMatmulOp = - b.create(tiledMatmulOp, reductionTileSizes); - auto matmulKOp = tileKMatmulOp.getTiledLinalgOp(); - auto forLoops = tileKMatmulOp.getLoops(); - if (!forLoops.empty()) { - b.create(forLoops[0], getMatmulMainLoopMarker(), - Value()); - } else { - b.create(matmulKOp, getMatmulMainLoopMarker(), - Value()); - } - - b.create(matmulKOp, getLinalgMMALevelAttrName(), - mmaLevel); - b.create(matmulKOp, getLinalgTargetAttrName(), - target); - b.create(matmulKOp, getMMAPatternAttrName(), - Value()); - }; - - pm.addPass(createGenericTransformInsertionPass(config)); -} - -} // namespace - -void mlir::createGPUTileGemmTransform(OpPassManager &pm, - const GPUGemmGeneralOptions &options) { - invokeOpPassPipelineBuilder(createGPUTileGemmTransformImpl, pm, - options.funcAnchor, options.annotatePrefix); -} - -namespace { - -void createGPUAddGemmCodegenLoweringConfigTransformImpl( - OpPassManager &pm, const std::string &anchor, const std::string &prefix, - ArrayRef tileSizeConfig, ArrayRef workgroupSize, - int64_t stages) { - - SmallVector tileSizeConfigVec{tileSizeConfig}; - SmallVector workgroupSizeVec{workgroupSize}; - - TransformInsertionConfig config; - config.funcAnchor = anchor; - config.matchPrefix = prefix; - - config.opFilter = [=](Operation *op) { - if (isLinalgOpMatmul(op)) { - // TODO: check if the matmul op is already annotated - // TODO: Add different lowering config for different matmul op size - return true; - } - return false; - }; - - config.transformBuilder = [=](ImplicitLocOpBuilder &b, Operation *op, - Value pdlV) { - // auto linalgOp = llvm::cast(op); - auto tileSizeConfigAttrs = b.getAttr(llvm::to_vector( - llvm::map_range(tileSizeConfigVec, [&](int64_t i) -> Attribute { - return b.getI64IntegerAttr(i); - }))); - auto workgroupSizeAttrs = b.getAttr(llvm::to_vector( - llvm::map_range(workgroupSizeVec, [&](int64_t i) -> Attribute { - return b.getI64IntegerAttr(i); - }))); - auto stagesAttr = b.getI64IntegerAttr(stages); - - auto func = b.create( - pdlV.getType(), pdlV, - /* isolated_from_above */ true, - /* allow_empty_results */ false, - /* op_name */ b.getStringAttr(func::FuncOp::getOperationName()), - /* deduplicate */ false, - /* nth_parent */ 1); - - Value tileSizeConfigValue = b.create( - /* type */ pdl::AttributeType::get(b.getContext()), - /* value */ tileSizeConfigAttrs); - Value workgroupSizeValue = b.create( - /* type */ pdl::AttributeType::get(b.getContext()), - /* value */ workgroupSizeAttrs); - Value stagesValue = b.create( - /* type */ pdl::AttributeType::get(b.getContext()), - /* value */ stagesAttr); - - b.create(func, getGemmTileConfigAttrName(), - tileSizeConfigValue); - b.create(func, getGemmBlockSizeAttrName(), - workgroupSizeValue); - b.create(func, getGemmPipelineDepthAttrName(), - stagesValue); - }; - pm.addPass(createGenericTransformInsertionPass(config)); -} -} // namespace - -void mlir::createGPUAddGemmCodegenLoweringConfigTransform( - OpPassManager &pm, const GPUGemmCodegenConfigOptions &options) { - invokeOpPassPipelineBuilder( - createGPUAddGemmCodegenLoweringConfigTransformImpl, pm, - options.funcAnchor, options.annotatePrefix, options.tileSizeConfig, - options.workgroupSize, options.stages); -} - -} \ No newline at end of file diff --git 
a/midend/lib/Dialect/Transform/Transforms/PassDetail.h b/midend/lib/Dialect/Transform/Transforms/PassDetail.h new file mode 100644 index 0000000000..0c4840f1d6 --- /dev/null +++ b/midend/lib/Dialect/Transform/Transforms/PassDetail.h @@ -0,0 +1,13 @@ +#ifndef TRANSFORM_PASSDETAIL_H +#define TRANSFORM_PASSDETAIL_H + +#include "mlir/Pass/Pass.h" + +namespace mlir { + +#define GEN_PASS_CLASSES +#include "Transform/Passes.h.inc" + +} // namespace mlir + +#endif \ No newline at end of file diff --git a/midend/lib/Dialect/Transform/Transforms/TransformDialectInterpreter.cpp b/midend/lib/Dialect/Transform/Transforms/TransformDialectInterpreter.cpp index 24f81e5a89..2890724998 100644 --- a/midend/lib/Dialect/Transform/Transforms/TransformDialectInterpreter.cpp +++ b/midend/lib/Dialect/Transform/Transforms/TransformDialectInterpreter.cpp @@ -1,15 +1,17 @@ -#define GEN_PASS_DEF_TRANSFORMDIALECTINTERPRETER -#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h" -#include "Transform/Passes.h" +#include "Transform/Transforms/TransformDialectInterpreter.h" + +#include "mlir/Dialect/Transform/IR/TransformInterfaces.h" #include "mlir/IR/BuiltinOps.h" +#include "PassDetail.h" + using namespace mlir; namespace { struct TransformDialectInterpreterPass - : public impl::TransformDialectInterpreterBase { - + : public TransformDialectInterpreterBase { + explicit TransformDialectInterpreterPass(bool erase) : TransformDialectInterpreterBase() { eraseAfter = erase; @@ -40,6 +42,6 @@ struct TransformDialectInterpreterPass } // namespace std::unique_ptr> -createTransformDialectInterpreter(bool eraseAfter = false) { +mlir::createTransformDialectInterpreter(bool eraseAfter) { return std::make_unique(eraseAfter); } \ No newline at end of file diff --git a/midend/lib/Dialect/Transform/Transforms/TransformInsertion.cpp b/midend/lib/Dialect/Transform/Transforms/TransformInsertion.cpp index 6595b8347c..1e936d8f9b 100644 --- a/midend/lib/Dialect/Transform/Transforms/TransformInsertion.cpp +++ b/midend/lib/Dialect/Transform/Transforms/TransformInsertion.cpp @@ -1,7 +1,7 @@ -#include "Transform/Passes.h" +#include "Transform/Transforms/TransformInsertion.h" -#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h" #include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/PDL/IR/PDLOps.h" #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/Linalg/TransformOps/DialectExtension.h" #include "mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h" @@ -16,6 +16,8 @@ #include "mlir/IR/MLIRContext.h" #include "mlir/IR/Operation.h" +#include "PassDetail.h" + #include #include @@ -24,6 +26,12 @@ using namespace llvm; namespace { +inline std::string getAnnotationUniqueIdentifier(const std::string matchPrefix) +{ + static size_t cnt = 0; + return matchPrefix + "_" + std::to_string(cnt++); +} + void insertTransformIR(func::FuncOp funcOp, OpBuilder &builder, const TransformInsertionConfig &config) { funcOp->walk([&](Operation *op) { @@ -42,8 +50,8 @@ void insertTransformIR(func::FuncOp funcOp, OpBuilder &builder, ctx, b.getNamedAttr(annotation, UnitAttr::get(ctx))); auto match = b.create( loc, blockArg.getType(), blockArg, ArrayAttr(), - transform::MatchInterfaceEnumAttr(), annotationAttr, TypeAttr(), - ArrayAttr()); + transform::MatchInterfaceEnumAttr(), annotationAttr, TypeAttr() + /*ArrayAttr()*/); ImplicitLocOpBuilder ib(loc, b); config.transformBuilder(ib, op, match); b.create(loc); From ef66238ef0282e3a88f76a9f8e78551539a6b1e6 Mon Sep 17 00:00:00 2001 From: CJ77Qi Date: Wed, 21 Aug 2024 
11:24:40 +0000 Subject: [PATCH 3/7] fix Transform Transform Pass --- midend/include/Dialect/Transform/Passes.td | 2 +- .../Dialect/Transform/Transforms/TransformInsertion.h | 2 +- midend/lib/Dialect/Transform/Transforms/CMakeLists.txt | 3 --- .../Transform/Transforms/TransformDialectInterpreter.cpp | 1 - .../Dialect/Transform/Transforms/TransformInsertion.cpp | 7 +++---- 5 files changed, 5 insertions(+), 10 deletions(-) diff --git a/midend/include/Dialect/Transform/Passes.td b/midend/include/Dialect/Transform/Passes.td index 104523d2e8..f3790312ce 100644 --- a/midend/include/Dialect/Transform/Passes.td +++ b/midend/include/Dialect/Transform/Passes.td @@ -1,7 +1,7 @@ include "mlir/Pass/PassBase.td" // TransformDialectInterpreter -def TransformDialectInterpreter : Pass<"transform-dialect-interpreter", "mlir::ModuleOp"> { +def TransformDialectInterpreter : Pass<"transform-dialect-interpreter", "ModuleOp"> { let summary = "Apply transform dialect operations one by one"; let constructor = "mlir::createTransformDialectInterpreter()"; let options = [ diff --git a/midend/include/Dialect/Transform/Transforms/TransformInsertion.h b/midend/include/Dialect/Transform/Transforms/TransformInsertion.h index 22ad91c09e..4d30289bcb 100644 --- a/midend/include/Dialect/Transform/Transforms/TransformInsertion.h +++ b/midend/include/Dialect/Transform/Transforms/TransformInsertion.h @@ -12,7 +12,7 @@ struct TransformInsertionConfig { std::string matchPrefix; std::function opFilter; std::function transformBuilder; -} +}; std::unique_ptr> createGenericTransformInsertionPass(const TransformInsertionConfig &config); diff --git a/midend/lib/Dialect/Transform/Transforms/CMakeLists.txt b/midend/lib/Dialect/Transform/Transforms/CMakeLists.txt index ac0fe3b197..9624c9f651 100644 --- a/midend/lib/Dialect/Transform/Transforms/CMakeLists.txt +++ b/midend/lib/Dialect/Transform/Transforms/CMakeLists.txt @@ -4,13 +4,10 @@ add_mlir_dialect_library(BuddyTransformPasses DEPENDS BuddyTransformPassIncGen -# MLIRLinalgExtTransformOps -# MLIRTransformExtOpsIncGen LINK_LIBS PUBLIC MLIRIR MLIRPass MLIRPDLDialect MLIRTransformDialect -# MLIRLinalgExtTransformOps ) diff --git a/midend/lib/Dialect/Transform/Transforms/TransformDialectInterpreter.cpp b/midend/lib/Dialect/Transform/Transforms/TransformDialectInterpreter.cpp index 2890724998..406f5f70f3 100644 --- a/midend/lib/Dialect/Transform/Transforms/TransformDialectInterpreter.cpp +++ b/midend/lib/Dialect/Transform/Transforms/TransformDialectInterpreter.cpp @@ -1,5 +1,4 @@ #include "Transform/Transforms/TransformDialectInterpreter.h" - #include "mlir/Dialect/Transform/IR/TransformInterfaces.h" #include "mlir/IR/BuiltinOps.h" diff --git a/midend/lib/Dialect/Transform/Transforms/TransformInsertion.cpp b/midend/lib/Dialect/Transform/Transforms/TransformInsertion.cpp index 1e936d8f9b..f54e6d9a09 100644 --- a/midend/lib/Dialect/Transform/Transforms/TransformInsertion.cpp +++ b/midend/lib/Dialect/Transform/Transforms/TransformInsertion.cpp @@ -1,5 +1,3 @@ -#include "Transform/Transforms/TransformInsertion.h" - #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/PDL/IR/PDLOps.h" #include "mlir/Dialect/Linalg/IR/Linalg.h" @@ -17,6 +15,7 @@ #include "mlir/IR/Operation.h" #include "PassDetail.h" +#include "Transform/Transforms/TransformInsertion.h" #include #include @@ -91,6 +90,6 @@ struct GenericTransformInsertionPass } // namespace std::unique_ptr> -mlir::createGenericTransformInsertionPass(const TransformInsertionConfig &config) { +createGenericTransformInsertionPass(const 
mlir::TransformInsertionConfig &config) { return std::make_unique(config); -} \ No newline at end of file +} From 50a55a96fc63ebbf81a546ce56daa5dddab39c2e Mon Sep 17 00:00:00 2001 From: CJ77Qi Date: Thu, 29 Aug 2024 10:06:32 +0800 Subject: [PATCH 4/7] fix bug --- CMakeLists.txt | 1 + examples/BuddyMatmul/linalg-matmul.mlir | 21 ++++ midend/include/CMakeLists.txt | 2 +- midend/include/Dialect/GPU/CMakeLists.txt | 0 midend/include/Dialect/GPU/Transforms/Utils.h | 22 +++++ .../Transforms/TransformDialectInterpreter.h | 7 +- .../Transform/Transforms/TransformInsertion.h | 7 +- .../Pipelines/GPU/GemmCodegenTransform.h | 36 +++++++ midend/include/Pipelines/GPU/Utils.h | 32 ++++++ midend/include/Pipelines/LinalgTensorOpt.h | 31 ++++++ midend/lib/CMakeLists.txt | 1 + midend/lib/Dialect/GPU/CMakeLists.txt | 0 .../Transforms/TransformInsertion.cpp | 11 ++- midend/lib/Pipelines/CMakeLists.txt | 10 ++ midend/lib/Pipelines/GPU/CMakeLists.txt | 11 +++ .../Pipelines/GPU/GemmCodegenTransform.cpp | 98 +++++++++++++++++++ midend/lib/Pipelines/GPU/Utils.cpp | 52 ++++++++++ midend/lib/Pipelines/LinalgTensorOpt.cpp | 48 +++++++++ tools/buddy-opt/CMakeLists.txt | 3 + tools/buddy-opt/buddy-opt.cpp | 5 + 20 files changed, 392 insertions(+), 6 deletions(-) create mode 100644 examples/BuddyMatmul/linalg-matmul.mlir create mode 100644 midend/include/Dialect/GPU/CMakeLists.txt create mode 100644 midend/include/Dialect/GPU/Transforms/Utils.h create mode 100644 midend/include/Pipelines/GPU/GemmCodegenTransform.h create mode 100644 midend/include/Pipelines/GPU/Utils.h create mode 100644 midend/include/Pipelines/LinalgTensorOpt.h create mode 100644 midend/lib/Dialect/GPU/CMakeLists.txt create mode 100644 midend/lib/Pipelines/CMakeLists.txt create mode 100644 midend/lib/Pipelines/GPU/CMakeLists.txt create mode 100644 midend/lib/Pipelines/GPU/GemmCodegenTransform.cpp create mode 100644 midend/lib/Pipelines/GPU/Utils.cpp create mode 100644 midend/lib/Pipelines/LinalgTensorOpt.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 4863467449..0f440738c6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,6 +22,7 @@ project(buddy-mlir LANGUAGES CXX C) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED YES) +set(CMAKE_BUILD_TYPE Debug) include(ExternalProject) #------------------------------------------------------------------------------- diff --git a/examples/BuddyMatmul/linalg-matmul.mlir b/examples/BuddyMatmul/linalg-matmul.mlir new file mode 100644 index 0000000000..5fbfd77013 --- /dev/null +++ b/examples/BuddyMatmul/linalg-matmul.mlir @@ -0,0 +1,21 @@ +#map = affine_map<(d0, d1, d2) -> (d0, d2)> +#map1 = affine_map<(d0, d1, d2) -> (d2, d1)> +#map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +module attributes {} { + func.func private @Unknown0(%arg0: tensor<1280x32xf16>, %arg1: tensor<32x1280xf16>) -> tensor<1280x1280xf16> attributes {} { + %cst = arith.constant 0.000000e+00 : f16 + %0 = tensor.empty() : tensor<1280x1280xf16> + %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<1280x1280xf16>) -> tensor<1280x1280xf16> + %2 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<1280x32xf16>, tensor<32x1280xf16>) outs(%1 : tensor<1280x1280xf16>) { + ^bb0(%in: f16, %in_0: f16, %out: f16): + %3 = arith.mulf %in, %in_0 : f16 + %4 = arith.addf %out, %3 : f16 + linalg.yield %4 : f16 + } -> tensor<1280x1280xf16> + return %2 : tensor<1280x1280xf16> + } + func.func @forward(%arg0: tensor<1280x32xf16>, %arg1: tensor<32x1280xf16>) -> 
tensor<1280x1280xf16> { + %0 = call @Unknown0(%arg0, %arg1) : (tensor<1280x32xf16>, tensor<32x1280xf16>) -> tensor<1280x1280xf16> + return %0 : tensor<1280x1280xf16> + } +} diff --git a/midend/include/CMakeLists.txt b/midend/include/CMakeLists.txt index 0ca0f41c5a..e6f347c8c1 100644 --- a/midend/include/CMakeLists.txt +++ b/midend/include/CMakeLists.txt @@ -1 +1 @@ -add_subdirectory(Dialect) +add_subdirectory(Dialect) \ No newline at end of file diff --git a/midend/include/Dialect/GPU/CMakeLists.txt b/midend/include/Dialect/GPU/CMakeLists.txt new file mode 100644 index 0000000000..e69de29bb2 diff --git a/midend/include/Dialect/GPU/Transforms/Utils.h b/midend/include/Dialect/GPU/Transforms/Utils.h new file mode 100644 index 0000000000..5824dfb5b3 --- /dev/null +++ b/midend/include/Dialect/GPU/Transforms/Utils.h @@ -0,0 +1,22 @@ +#ifndef GPU_TRANSFORMS_UTILS_H +#define GPU_TRANSFORMS_UTILS_H + +#include "mlir/Dialect/Linalg/Utils/Utils.h" + +namespace mlir { + +static constexpr StringRef getGemmTileConfigAttrName() { + return "__buddy_gemm_tile_config__"; +} + +static constexpr StringRef getGemmBlockSizeAttrName() { + return "__buddy_gemm_block_size__"; +} + +static constexpr StringRef getGemmPipelineStageAttrName() { + return "__buddy_gemm_pipeline_stage__"; +} + +} + +#endif \ No newline at end of file diff --git a/midend/include/Dialect/Transform/Transforms/TransformDialectInterpreter.h b/midend/include/Dialect/Transform/Transforms/TransformDialectInterpreter.h index fcd54ba8c1..d457a1bbe7 100644 --- a/midend/include/Dialect/Transform/Transforms/TransformDialectInterpreter.h +++ b/midend/include/Dialect/Transform/Transforms/TransformDialectInterpreter.h @@ -1,3 +1,6 @@ +#ifndef TRANSFORM_TRANSFORMS_TRANSFORMDIALECTINTERPRETER_H +#define TRANSFORM_TRANSFORMS_TRANSFORMDIALECTINTERPRETER_H + #include "mlir/Pass/Pass.h" #include @@ -8,4 +11,6 @@ class ModuleOp; std::unique_ptr> createTransformDialectInterpreter(bool eraseAfter = false); -} // namespace mlir \ No newline at end of file +} // namespace mlir + +#endif \ No newline at end of file diff --git a/midend/include/Dialect/Transform/Transforms/TransformInsertion.h b/midend/include/Dialect/Transform/Transforms/TransformInsertion.h index 4d30289bcb..099bea6a38 100644 --- a/midend/include/Dialect/Transform/Transforms/TransformInsertion.h +++ b/midend/include/Dialect/Transform/Transforms/TransformInsertion.h @@ -1,3 +1,6 @@ +#ifndef TRANSFORM_TRANSFORMS_TRANSFORMINSERTION_H +#define TRANSFORM_TRANSFORMS_TRANSFORMINSERTION_H + #include "mlir/Pass/Pass.h" #include #include @@ -17,4 +20,6 @@ struct TransformInsertionConfig { std::unique_ptr> createGenericTransformInsertionPass(const TransformInsertionConfig &config); -} \ No newline at end of file +} //namespace mlir + +#endif \ No newline at end of file diff --git a/midend/include/Pipelines/GPU/GemmCodegenTransform.h b/midend/include/Pipelines/GPU/GemmCodegenTransform.h new file mode 100644 index 0000000000..6bec6ec46e --- /dev/null +++ b/midend/include/Pipelines/GPU/GemmCodegenTransform.h @@ -0,0 +1,36 @@ +#ifndef PIPELINES_GPU_GEMM_CODEGEN_TRANSOFRM_H +#define PIPELINES_GPU_GEMM_CODEGEN_TRANSOFRM_H + +#include "mlir/Pass/PassManager.h" +#include "mlir/Pass/PassOptions.h" +#include "mlir/Pass/PassRegistry.h" + +namespace mlir { +namespace buddy { +struct GPUGemmCodegenConfigOptions : public PassPipelineOptions { + Option funcAnchor { + *this, "func-anchor", + llvm::cl::desc( + "An optional Unit attribute anchoring on target functions."), + llvm::cl::init("")}; + Option annotatePrefix { + 
*this, "annotate-prefix", + llvm::cl::desc("An optional annotate prefix attribute on target ops."), + llvm::cl::init("__buddy_gpu_gemm__")}; + ListOption tileConfig { + *this, "tile-config", + llvm::cl::desc("An optional tile config for matmul op")}; + ListOption workGroup { + *this, "work-group", + llvm::cl::desc("An optional workgroup size config for matmul op")}; + Option stages { + *this, "stages", + llvm::cl::desc("An optional stages config for matmul op")}; +}; + +void createGPUGemmTileConfigInsertTransform(OpPassManager &pm, const GPUGemmCodegenConfigOptions &options); + +} // namespace mlir::buddy +} // namespace mlir + +#endif \ No newline at end of file diff --git a/midend/include/Pipelines/GPU/Utils.h b/midend/include/Pipelines/GPU/Utils.h new file mode 100644 index 0000000000..857b796618 --- /dev/null +++ b/midend/include/Pipelines/GPU/Utils.h @@ -0,0 +1,32 @@ +#ifndef PIPELINES_GPU_UTILS_H +#define PIPELINES_GPU_UTILS_H + +#include "mlir/Pass/PassManager.h" +#include "mlir/Pass/PassRegistry.h" + +namespace mlir { +class ModuleOp; +namespace buddy { +template +void invokeOpPassPipelineBuilder(Builder builder, OpPassManager &pm, + Args &&...args) { + if (pm.getOpAnchorName() != OpPassManager::getAnyOpAnchorName() && + pm.getOpAnchorName() != OpClass::getOperationName()) { + if (pm.getNesting() == OpPassManager::Nesting::Implicit) { + builder(pm.nest(), std::forward(args)...); + return; + } + llvm::report_fatal_error( + llvm::Twine("Can't build pass pipeline on expected op type ") + + OpClass::getOperationName() + " but got " + pm.getOpAnchorName()); + } else { + builder(pm, std::forward(args)...); + } +} + +bool isLinalgMatmul(Operation *op); + +} // namespace buddy::pipelines +} // namespace buddy + +#endif \ No newline at end of file diff --git a/midend/include/Pipelines/LinalgTensorOpt.h b/midend/include/Pipelines/LinalgTensorOpt.h new file mode 100644 index 0000000000..d379f81cb3 --- /dev/null +++ b/midend/include/Pipelines/LinalgTensorOpt.h @@ -0,0 +1,31 @@ +#ifndef PIPELINES_GPU_LINALGTENSOROPT_H +#define PIPELINES_GPU_LINALGTENSOROPT_H + +#include "mlir/Pass/Pass.h" +#include "mlir/Pass/PassManager.h" +#include +#include + +namespace mlir { +namespace buddy { + +struct LinalgTensorOptPipelineOptions + : public PassPipelineOptions { + Option target{ + *this, "target", + llvm::cl::desc("An optional attribute to speicify target."), + llvm::cl::init("gpu")}; + Option arch{ + *this, "arch", llvm::cl::desc("An optional attribute to speicify arch."), + llvm::cl::init("nv_sm_80")}; +}; + +void createLinalgTensorOptPassPipeline(OpPassManager &pm, const LinalgTensorOptPipelineOptions &options); + +void registerLinalgTensorOptPassPipeline(); + + +} // namespace mlir::buddy +} // namespace mlir + +#endif \ No newline at end of file diff --git a/midend/lib/CMakeLists.txt b/midend/lib/CMakeLists.txt index 19b254cf38..865af316e4 100644 --- a/midend/lib/CMakeLists.txt +++ b/midend/lib/CMakeLists.txt @@ -2,6 +2,7 @@ add_subdirectory(Dialect) add_subdirectory(Conversion) add_subdirectory(Target) add_subdirectory(Utils) +add_subdirectory(Pipelines) # Build static library for async runtime. 
add_mlir_library(static_mlir_async_runtime diff --git a/midend/lib/Dialect/GPU/CMakeLists.txt b/midend/lib/Dialect/GPU/CMakeLists.txt new file mode 100644 index 0000000000..e69de29bb2 diff --git a/midend/lib/Dialect/Transform/Transforms/TransformInsertion.cpp b/midend/lib/Dialect/Transform/Transforms/TransformInsertion.cpp index f54e6d9a09..7018174fb8 100644 --- a/midend/lib/Dialect/Transform/Transforms/TransformInsertion.cpp +++ b/midend/lib/Dialect/Transform/Transforms/TransformInsertion.cpp @@ -28,13 +28,14 @@ namespace { inline std::string getAnnotationUniqueIdentifier(const std::string matchPrefix) { static size_t cnt = 0; - return matchPrefix + "_" + std::to_string(cnt++); + return matchPrefix + std::to_string(cnt++); } void insertTransformIR(func::FuncOp funcOp, OpBuilder &builder, const TransformInsertionConfig &config) { - funcOp->walk([&](Operation *op) { + funcOp->walk([&](Operation *op) { if (config.opFilter(op)) { + op->print(llvm::errs()); ImplicitLocOpBuilder b(op->getLoc(), builder); MLIRContext *ctx = b.getContext(); @@ -51,10 +52,14 @@ void insertTransformIR(func::FuncOp funcOp, OpBuilder &builder, loc, blockArg.getType(), blockArg, ArrayAttr(), transform::MatchInterfaceEnumAttr(), annotationAttr, TypeAttr() /*ArrayAttr()*/); + // debug + match->print(llvm::errs()); + llvm::errs() << "\n"; ImplicitLocOpBuilder ib(loc, b); config.transformBuilder(ib, op, match); b.create(loc); }); + funcOp->print(llvm::errs()); } }); } @@ -90,6 +95,6 @@ struct GenericTransformInsertionPass } // namespace std::unique_ptr> -createGenericTransformInsertionPass(const mlir::TransformInsertionConfig &config) { +mlir::createGenericTransformInsertionPass(const mlir::TransformInsertionConfig &config) { return std::make_unique(config); } diff --git a/midend/lib/Pipelines/CMakeLists.txt b/midend/lib/Pipelines/CMakeLists.txt new file mode 100644 index 0000000000..ed97f0e647 --- /dev/null +++ b/midend/lib/Pipelines/CMakeLists.txt @@ -0,0 +1,10 @@ +add_subdirectory(GPU) + +add_mlir_library(BuddyPipelines + LinalgTensorOpt.cpp + GPU/Utils.cpp + LINK_LIBS PUBLIC + MLIRIR + BuddyGPUPipelines + BuddyTransformPasses +) \ No newline at end of file diff --git a/midend/lib/Pipelines/GPU/CMakeLists.txt b/midend/lib/Pipelines/GPU/CMakeLists.txt new file mode 100644 index 0000000000..1f5a151312 --- /dev/null +++ b/midend/lib/Pipelines/GPU/CMakeLists.txt @@ -0,0 +1,11 @@ +add_mlir_library(BuddyGPUPipelines + GemmCodegenTransform.cpp + Utils.cpp + + LINK_LIBS PUBLIC + MLIRIR + MLIRPDLDialect + MLIRTransformDialect + MLIRTransforms + BuddyTransformPasses +) \ No newline at end of file diff --git a/midend/lib/Pipelines/GPU/GemmCodegenTransform.cpp b/midend/lib/Pipelines/GPU/GemmCodegenTransform.cpp new file mode 100644 index 0000000000..8bca10bfd8 --- /dev/null +++ b/midend/lib/Pipelines/GPU/GemmCodegenTransform.cpp @@ -0,0 +1,98 @@ +#include "Transform/Transforms/TransformInsertion.h" +#include "GPU/Transforms/Utils.h" +#include "Pipelines/GPU/GemmCodegenTransform.h" +#include "Pipelines/GPU/Utils.h" + +#include "mlir/Dialect/Transform/IR/TransformOps.h" +#include "mlir/Dialect/PDL/IR/PDLOps.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h" +#include "mlir/Dialect/MemRef/TransformOps/MemRefTransformOps.h" +#include "mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/IR/BuiltinOps.h" +#include 
"llvm/ADT/SmallSet.h" + +#include + +using namespace mlir; + +namespace { + +void createGPUAddGemmCodegenLoweringConfigTransformImpl( + OpPassManager &pm, const std::string &anchor, const std::string &prefix, + ArrayRef tileConfig, ArrayRef workGroup, int64_t stages) { + + SmallVector vecTileConfig{tileConfig}; + SmallVector vecWorkGroup{workGroup}; + + TransformInsertionConfig config; + config.funcAnchor = anchor; + + config.matchPrefix = prefix; + + config.opFilter = [=](Operation *op){ + if (mlir::buddy::isLinalgMatmul(op)) { + return true; + } + return false; + }; + + config.transformBuilder = [=](ImplicitLocOpBuilder &b, Operation *op, Value pdlV) { + auto tileConfigAttrs = b.getAttr(llvm::to_vector( + llvm::map_range(vecTileConfig, [&](int64_t i) -> Attribute { + return b.getI64IntegerAttr(i); + }))); + auto workgroupAttrs = b.getAttr(llvm::to_vector( + llvm::map_range(vecWorkGroup, [&](int64_t i) -> Attribute { + return b.getI64IntegerAttr(i); + }))); + auto stagesAttr = b.getI64IntegerAttr(stages); + + auto func = b.create( + pdlV.getType(), pdlV, + /* isolated_from_above */ true, + /* allow_empty_results */ false, + /* op_name */ b.getStringAttr(func::FuncOp::getOperationName()), + /* deduplicate */ false, + /* nth_parent */ 1); + + Value tileConfigValue = b.create( + /* type */ pdl::AttributeType::get(b.getContext()), + /* value */ tileConfigAttrs + ); + + llvm::errs() << tileConfigValue << "\n"; + + Value workGroupValue = b.create( + /* type */ pdl::AttributeType::get(b.getContext()), + /* value */ workgroupAttrs + ); + + Value stagesValue = b.create( + /* type */ pdl::AttributeType::get(b.getContext()), + /* value */ stagesAttr + ); + + b.create(func, getGemmTileConfigAttrName(), + tileConfigValue); + b.create(func, getGemmBlockSizeAttrName(), + workGroupValue); + b.create(func, getGemmPipelineStageAttrName(), + stagesValue); + }; + + pm.addPass(createGenericTransformInsertionPass(config)); +} + +} // namespace + +void mlir::buddy::createGPUGemmTileConfigInsertTransform( + OpPassManager &pm, const GPUGemmCodegenConfigOptions &options) { + mlir::buddy::invokeOpPassPipelineBuilder( + createGPUAddGemmCodegenLoweringConfigTransformImpl, pm, + options.funcAnchor, options.annotatePrefix, options.tileConfig, + options.workGroup, options.stages); +} \ No newline at end of file diff --git a/midend/lib/Pipelines/GPU/Utils.cpp b/midend/lib/Pipelines/GPU/Utils.cpp new file mode 100644 index 0000000000..7298195995 --- /dev/null +++ b/midend/lib/Pipelines/GPU/Utils.cpp @@ -0,0 +1,52 @@ +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/Linalg/Utils/Utils.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" + +#include "Pipelines/GPU/Utils.h" + +using namespace mlir; +using namespace llvm; + +namespace mlir { +namespace buddy { + +bool isLinalgMatmul(Operation *op) { + if (!llvm::isa(op)) { + return false; + } + + linalg::LinalgOp linalgOp = cast(op); + if (isa(linalgOp) || isa(linalgOp)) { + return true; + } else { + if (!(linalg::isaContractionOpInterface(linalgOp) && + linalgOp.getNumParallelLoops() >= 2 && + linalgOp.getNumParallelLoops() <= 3)) { + return false; + } + Region &body = linalgOp->getRegion(0); + Region::OpIterator it = body.op_begin(); + while (it != body.op_end() && isa(*it)) + it++; + if (it == body.op_end() || !isa(*(it++))) + return false; + if (it == body.op_end() || !isa(*(it++))) + return false; + if (it == body.op_end() || !isa(*(it++))) + return false; + 
AffineMap outputMap = linalgOp.getMatchingIndexingMap(linalgOp.getDpsInitOperand(0)); + if (outputMap.getNumResults() != outputMap.getNumDims() - 1) + return false; + OpBuilder b(linalgOp); + for (unsigned i = 0; i < outputMap.getNumResults(); i++) { + if (outputMap.getResult(i) != b.getAffineDimExpr(i)) + return false; + } + return true; + } +} + +} // namespace mlir::buddy +} // namespace mlir \ No newline at end of file diff --git a/midend/lib/Pipelines/LinalgTensorOpt.cpp b/midend/lib/Pipelines/LinalgTensorOpt.cpp new file mode 100644 index 0000000000..a0bce01fd3 --- /dev/null +++ b/midend/lib/Pipelines/LinalgTensorOpt.cpp @@ -0,0 +1,48 @@ +#include "Pipelines/LinalgTensorOpt.h" +#include "Pipelines/GPU/Utils.h" +#include "Pipelines/GPU/GemmCodegenTransform.h" + +#include "Transform/Transforms/TransformDialectInterpreter.h" +#include "Transform/Transforms/TransformInsertion.h" + +#include "mlir/Dialect/Linalg/Passes.h" + +using namespace mlir; + +namespace { + +void addGPULinalgOptPasses(OpPassManager &pm) { + { // Gemm Codegen Linalg Tensor Opt + SmallVector tileConfig = {32, 32, 16}; + SmallVector workGroup = {32, 2, 1}; + int64_t stages = 3; + mlir::buddy::GPUGemmCodegenConfigOptions configOption; + configOption.tileConfig = tileConfig; + configOption.workGroup = workGroup; + configOption.stages = stages; + createGPUGemmTileConfigInsertTransform(pm, configOption); + pm.addPass(createTransformDialectInterpreter(true)); + } +} + +void createLinalgTensorOptPassPipelineimpl(OpPassManager &pm, + const std::string &target, + const std::string &arch) { + if (target == "gpu") { + addGPULinalgOptPasses(pm); + } else { + /*TODO*/ + } +} + +} // namespace + +void mlir::buddy::createLinalgTensorOptPassPipeline(OpPassManager &pm, + const LinalgTensorOptPipelineOptions &options) { + mlir::buddy::invokeOpPassPipelineBuilder(createLinalgTensorOptPassPipelineimpl, pm, options.target, options.arch); +} + +void mlir::buddy::registerLinalgTensorOptPassPipeline() { + PassPipelineRegistration( + "linalg-tensor-opt", "Linalg with Tensor Opt Pass Pipeline", mlir::buddy::createLinalgTensorOptPassPipeline); +} diff --git a/tools/buddy-opt/CMakeLists.txt b/tools/buddy-opt/CMakeLists.txt index 24bcde9359..7a87da7f24 100644 --- a/tools/buddy-opt/CMakeLists.txt +++ b/tools/buddy-opt/CMakeLists.txt @@ -37,4 +37,7 @@ target_link_libraries(buddy-opt SchedulingOnDevices LowerSche FuncBufferizeDynamicOffset + BuddyPipelines + BuddyGPUPipelines + BuddyTransformPasses ) diff --git a/tools/buddy-opt/buddy-opt.cpp b/tools/buddy-opt/buddy-opt.cpp index bea9513b5e..fc96aa8bae 100644 --- a/tools/buddy-opt/buddy-opt.cpp +++ b/tools/buddy-opt/buddy-opt.cpp @@ -47,6 +47,7 @@ #include "Gemmini/GemminiOps.h" #include "Sche/ScheDialect.h" #include "Sche/ScheOps.h" +#include "Pipelines/LinalgTensorOpt.h" namespace mlir { namespace buddy { @@ -71,6 +72,7 @@ void registerLowerLinalgToGemminiPass(); void registerDeviceSchedulePass(); void registerLowerSchePass(); void registerFuncBufferizeDynamicOffsetPass(); +void registerLinalgTensorOptPassPipeline(); } // namespace buddy } // namespace mlir @@ -104,6 +106,9 @@ int main(int argc, char **argv) { mlir::buddy::registerLowerSchePass(); mlir::buddy::registerFuncBufferizeDynamicOffsetPass(); + // Register Pipeline Passes + mlir::buddy::registerLinalgTensorOptPassPipeline(); + mlir::DialectRegistry registry; // Register all MLIR core dialects. 
registerAllDialects(registry); From b9e87d3ddc3d7795c51fcd236b74b17568c6fd7d Mon Sep 17 00:00:00 2001 From: CJ77Qi Date: Sat, 7 Sep 2024 14:30:39 +0800 Subject: [PATCH 5/7] add tiling pass of GPU GEMM Codegen --- examples/BuddyMatmul/linalg-matmul.mlir | 4 +- midend/include/Dialect/GPU/CMakeLists.txt | 0 midend/include/Dialect/GPU/Transforms/Utils.h | 22 -- .../Pipelines/GPU/GemmCodegenTransform.h | 19 +- midend/include/Pipelines/GPU/Utils.h | 57 +++++ .../Transforms/TransformInsertion.cpp | 6 +- .../Pipelines/GPU/GemmCodegenTransform.cpp | 213 ++++++++++++++++-- midend/lib/Pipelines/GPU/Utils.cpp | 57 +++++ midend/lib/Pipelines/LinalgTensorOpt.cpp | 16 +- 9 files changed, 336 insertions(+), 58 deletions(-) delete mode 100644 midend/include/Dialect/GPU/CMakeLists.txt delete mode 100644 midend/include/Dialect/GPU/Transforms/Utils.h diff --git a/examples/BuddyMatmul/linalg-matmul.mlir b/examples/BuddyMatmul/linalg-matmul.mlir index 5fbfd77013..72a70d48aa 100644 --- a/examples/BuddyMatmul/linalg-matmul.mlir +++ b/examples/BuddyMatmul/linalg-matmul.mlir @@ -2,7 +2,7 @@ #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> module attributes {} { - func.func private @Unknown0(%arg0: tensor<1280x32xf16>, %arg1: tensor<32x1280xf16>) -> tensor<1280x1280xf16> attributes {} { + func.func private @Matmul(%arg0: tensor<1280x32xf16>, %arg1: tensor<32x1280xf16>) -> tensor<1280x1280xf16> attributes {} { %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1280x1280xf16> %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<1280x1280xf16>) -> tensor<1280x1280xf16> @@ -15,7 +15,7 @@ module attributes {} { return %2 : tensor<1280x1280xf16> } func.func @forward(%arg0: tensor<1280x32xf16>, %arg1: tensor<32x1280xf16>) -> tensor<1280x1280xf16> { - %0 = call @Unknown0(%arg0, %arg1) : (tensor<1280x32xf16>, tensor<32x1280xf16>) -> tensor<1280x1280xf16> + %0 = call @Matmul(%arg0, %arg1) : (tensor<1280x32xf16>, tensor<32x1280xf16>) -> tensor<1280x1280xf16> return %0 : tensor<1280x1280xf16> } } diff --git a/midend/include/Dialect/GPU/CMakeLists.txt b/midend/include/Dialect/GPU/CMakeLists.txt deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/midend/include/Dialect/GPU/Transforms/Utils.h b/midend/include/Dialect/GPU/Transforms/Utils.h deleted file mode 100644 index 5824dfb5b3..0000000000 --- a/midend/include/Dialect/GPU/Transforms/Utils.h +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef GPU_TRANSFORMS_UTILS_H -#define GPU_TRANSFORMS_UTILS_H - -#include "mlir/Dialect/Linalg/Utils/Utils.h" - -namespace mlir { - -static constexpr StringRef getGemmTileConfigAttrName() { - return "__buddy_gemm_tile_config__"; -} - -static constexpr StringRef getGemmBlockSizeAttrName() { - return "__buddy_gemm_block_size__"; -} - -static constexpr StringRef getGemmPipelineStageAttrName() { - return "__buddy_gemm_pipeline_stage__"; -} - -} - -#endif \ No newline at end of file diff --git a/midend/include/Pipelines/GPU/GemmCodegenTransform.h b/midend/include/Pipelines/GPU/GemmCodegenTransform.h index 6bec6ec46e..5bb4f9432f 100644 --- a/midend/include/Pipelines/GPU/GemmCodegenTransform.h +++ b/midend/include/Pipelines/GPU/GemmCodegenTransform.h @@ -28,7 +28,24 @@ struct GPUGemmCodegenConfigOptions : public PassPipelineOptions { + Option funcAnchor{ + *this, "func-anchor", + llvm::cl::desc( + "An optional Unit attribute anchoring on target functions."), + llvm::cl::init("")}; + Option annotatePrefix { + *this, "annotate-prefix", + llvm::cl::desc("An optional annotate prefix attribute on 
target ops."), + llvm::cl::init("__buddy_gpu_gemm__")}; +}; + + +void createGemmTileConfigInsertTransform(OpPassManager &pm, const GPUGemmCodegenConfigOptions &options); + +void createGemmTileTransform(OpPassManager &pm, + const GPUGemmGeneralOptions &options); } // namespace mlir::buddy } // namespace mlir diff --git a/midend/include/Pipelines/GPU/Utils.h b/midend/include/Pipelines/GPU/Utils.h index 857b796618..7821b53c31 100644 --- a/midend/include/Pipelines/GPU/Utils.h +++ b/midend/include/Pipelines/GPU/Utils.h @@ -6,7 +6,13 @@ namespace mlir { class ModuleOp; + +namespace func { +class FuncOp; +} // namespace mlir::func + namespace buddy { + template void invokeOpPassPipelineBuilder(Builder builder, OpPassManager &pm, Args &&...args) { @@ -26,6 +32,57 @@ void invokeOpPassPipelineBuilder(Builder builder, OpPassManager &pm, bool isLinalgMatmul(Operation *op); + +static constexpr StringRef getGemmTileMConfigAttrName() { + return "__buddy_gemm_tile_config__M"; +} + +static constexpr StringRef getGemmTileNConfigAttrName() { + return "__buddy_gemm_tile_config__N"; +} + +static constexpr StringRef getGemmTileKConfigAttrName() { + return "__buddy_gemm_tile_config__K"; +} + +static constexpr StringRef getGemmBlockXSizeAttrName() { + return "__buddy_gemm_block_size__X"; +} + +static constexpr StringRef getGemmBlockYSizeAttrName() { + return "__buddy_gemm_block_size__Y"; +} + +static constexpr StringRef getGemmBlockZSizeAttrName() { + return "__buddy_gemm_block_size__Z"; +} + +static constexpr StringRef getGemmPipelineStageAttrName() { + return "__buddy_gemm_pipeline_stage__"; +} + +static constexpr StringRef getMatmulKMainLoopMarker() { + return "__buddy_gemm_main_loopk__"; +} + +constexpr StringRef getLinalgMMALevelAttrName() { + return "__buddy_mma_level__"; +} + +constexpr StringRef getMMAPatternAttrName() { return "__buddy_mma__"; } + + +std::optional> getGemmTileSize(func::FuncOp funcOp); + +std::optional> getGemmBlockSize(func::FuncOp funcOp); + +std::optional getGemmPipelineStages(func::FuncOp funcOp); + +void setMarker(mlir::Operation *op, llvm::StringRef marker); + +bool hasMarker(Operation *op, StringRef marker); + + } // namespace buddy::pipelines } // namespace buddy diff --git a/midend/lib/Dialect/Transform/Transforms/TransformInsertion.cpp b/midend/lib/Dialect/Transform/Transforms/TransformInsertion.cpp index 7018174fb8..ed6d6e4d59 100644 --- a/midend/lib/Dialect/Transform/Transforms/TransformInsertion.cpp +++ b/midend/lib/Dialect/Transform/Transforms/TransformInsertion.cpp @@ -35,7 +35,6 @@ void insertTransformIR(func::FuncOp funcOp, OpBuilder &builder, const TransformInsertionConfig &config) { funcOp->walk([&](Operation *op) { if (config.opFilter(op)) { - op->print(llvm::errs()); ImplicitLocOpBuilder b(op->getLoc(), builder); MLIRContext *ctx = b.getContext(); @@ -52,14 +51,11 @@ void insertTransformIR(func::FuncOp funcOp, OpBuilder &builder, loc, blockArg.getType(), blockArg, ArrayAttr(), transform::MatchInterfaceEnumAttr(), annotationAttr, TypeAttr() /*ArrayAttr()*/); - // debug - match->print(llvm::errs()); - llvm::errs() << "\n"; ImplicitLocOpBuilder ib(loc, b); + // pass op's handle to transform config.transformBuilder(ib, op, match); b.create(loc); }); - funcOp->print(llvm::errs()); } }); } diff --git a/midend/lib/Pipelines/GPU/GemmCodegenTransform.cpp b/midend/lib/Pipelines/GPU/GemmCodegenTransform.cpp index 8bca10bfd8..0b3c8f3e4f 100644 --- a/midend/lib/Pipelines/GPU/GemmCodegenTransform.cpp +++ b/midend/lib/Pipelines/GPU/GemmCodegenTransform.cpp @@ -1,5 +1,4 @@ #include 
"Transform/Transforms/TransformInsertion.h" -#include "GPU/Transforms/Utils.h" #include "Pipelines/GPU/GemmCodegenTransform.h" #include "Pipelines/GPU/Utils.h" @@ -18,10 +17,11 @@ #include using namespace mlir; +using namespace mlir::buddy; namespace { -void createGPUAddGemmCodegenLoweringConfigTransformImpl( +void createAddGemmCodegenLoweringConfigTransformImpl( OpPassManager &pm, const std::string &anchor, const std::string &prefix, ArrayRef tileConfig, ArrayRef workGroup, int64_t stages) { @@ -30,24 +30,24 @@ void createGPUAddGemmCodegenLoweringConfigTransformImpl( TransformInsertionConfig config; config.funcAnchor = anchor; - config.matchPrefix = prefix; - + // transform operation takes effect needed to have this op config.opFilter = [=](Operation *op){ - if (mlir::buddy::isLinalgMatmul(op)) { + if (isLinalgMatmul(op)) { return true; } return false; }; + // pdlV is a handle of op config.transformBuilder = [=](ImplicitLocOpBuilder &b, Operation *op, Value pdlV) { auto tileConfigAttrs = b.getAttr(llvm::to_vector( llvm::map_range(vecTileConfig, [&](int64_t i) -> Attribute { - return b.getI64IntegerAttr(i); + return b.getI64IntegerAttr(i); }))); auto workgroupAttrs = b.getAttr(llvm::to_vector( llvm::map_range(vecWorkGroup, [&](int64_t i) -> Attribute { - return b.getI64IntegerAttr(i); + return b.getI64IntegerAttr(i); }))); auto stagesAttr = b.getI64IntegerAttr(stages); @@ -58,28 +58,55 @@ void createGPUAddGemmCodegenLoweringConfigTransformImpl( /* op_name */ b.getStringAttr(func::FuncOp::getOperationName()), /* deduplicate */ false, /* nth_parent */ 1); - - Value tileConfigValue = b.create( - /* type */ pdl::AttributeType::get(b.getContext()), - /* value */ tileConfigAttrs + + Value tileConfigValue_M = b.create( + transform::ParamType::get(b.getContext(), mlir::IntegerType::get(b.getContext(), 64)), + /* value */ tileConfigAttrs.getValue()[0] + ); + + Value tileConfigValue_N = b.create( + transform::ParamType::get(b.getContext(), mlir::IntegerType::get(b.getContext(), 64)), + /* value */ tileConfigAttrs.getValue()[1] + ); + + Value tileConfigValue_K = b.create( + // /* type */ pdl::AttributeType::get(b.getContext()), + transform::ParamType::get(b.getContext(), mlir::IntegerType::get(b.getContext(), 64)), + /* value */ tileConfigAttrs.getValue()[2] + ); + + Value workGroupValue_X = b.create( + transform::ParamType::get(b.getContext(), mlir::IntegerType::get(b.getContext(), 64)), + /* value */ workgroupAttrs.getValue()[0] ); - llvm::errs() << tileConfigValue << "\n"; + Value workGroupValue_Y = b.create( + transform::ParamType::get(b.getContext(), mlir::IntegerType::get(b.getContext(), 64)), + /* value */ workgroupAttrs.getValue()[1] + ); - Value workGroupValue = b.create( - /* type */ pdl::AttributeType::get(b.getContext()), - /* value */ workgroupAttrs + Value workGroupValue_Z = b.create( + transform::ParamType::get(b.getContext(), mlir::IntegerType::get(b.getContext(), 64)), + /* value */ workgroupAttrs.getValue()[2] ); Value stagesValue = b.create( - /* type */ pdl::AttributeType::get(b.getContext()), + transform::ParamType::get(b.getContext(), mlir::IntegerType::get(b.getContext(), 64)), /* value */ stagesAttr ); - b.create(func, getGemmTileConfigAttrName(), - tileConfigValue); - b.create(func, getGemmBlockSizeAttrName(), - workGroupValue); + b.create(func, getGemmTileMConfigAttrName(), + tileConfigValue_M); + b.create(func, getGemmTileNConfigAttrName(), + tileConfigValue_N); + b.create(func, getGemmTileKConfigAttrName(), + tileConfigValue_K); + b.create(func, 
getGemmBlockXSizeAttrName(), + workGroupValue_X); + b.create(func, getGemmBlockYSizeAttrName(), + workGroupValue_Y); + b.create(func, getGemmBlockZSizeAttrName(), + workGroupValue_Z); b.create(func, getGemmPipelineStageAttrName(), stagesValue); }; @@ -89,10 +116,150 @@ void createGPUAddGemmCodegenLoweringConfigTransformImpl( } // namespace -void mlir::buddy::createGPUGemmTileConfigInsertTransform( +void mlir::buddy::createGemmTileConfigInsertTransform( OpPassManager &pm, const GPUGemmCodegenConfigOptions &options) { - mlir::buddy::invokeOpPassPipelineBuilder( - createGPUAddGemmCodegenLoweringConfigTransformImpl, pm, + invokeOpPassPipelineBuilder( + createAddGemmCodegenLoweringConfigTransformImpl, pm, options.funcAnchor, options.annotatePrefix, options.tileConfig, options.workGroup, options.stages); +} + +namespace { + +// TODO: Epilogue +void createGemmTileTransformImpl(OpPassManager &pm, + const std::string &anchor, + const std::string &prefix) { + TransformInsertionConfig config; + config.funcAnchor = anchor; + config.matchPrefix = prefix; + config.opFilter = [=](Operation *op){ + if (isLinalgMatmul(op)) { + return true; + } + return false; + }; + config.transformBuilder = [=](ImplicitLocOpBuilder &b, Operation *op, Value pdlV) { + func::FuncOp funcOp = op->getParentOfType(); + linalg::LinalgOp linalgOp = cast(op); + + SmallVector tileConfig = getGemmTileSize(funcOp).value(); + SmallVector workGroup = getGemmBlockSize(funcOp).value(); + int64_t stages = getGemmPipelineStages(funcOp).value(); + + bool hasEpilogue = false; + + auto func = b.create( + pdlV.getType(), pdlV, + /* isolated_from_above */ false, + /* allow_empty_results */ false, + /* op_name */ b.getStringAttr(func::FuncOp::getOperationName()), + /* deduplicate */ false, + /* nth_parent */ 1); + + auto linalgFillType = transform::OperationType::get( + b.getContext(), linalg::FillOp::getOperationName() + ); + auto linalgFillOp = b.create( + /* resultTypes */ linalgFillType, + /* target */ func, + /* opNames */ linalg::FillOp::getOperationName() + ); + + SmallVector mappingIdx; + bool isBMM = linalgOp.getNumParallelLoops() == 3; + if (isBMM) { + // 2 -> blockIdx.z 1 -> blockIdx.y 0->blockIdx.x + mappingIdx = {2, 1, 0}; + } else { + // 1 -> blockIdx.y 0 -> blockIdx.x + mappingIdx = {1, 0}; + } + + // get GPU BlockIdx mapping + auto mapping = llvm::to_vector(llvm::map_range( + mappingIdx, + [](int64_t i){return static_cast(i); + })); + auto mappingAttrs = llvm::to_vector(llvm::map_range( + mapping, + [&](gpu::MappingId dim) -> Attribute { + return gpu::GPUBlockMappingAttr::get(b.getContext(), dim); + })); + + SmallVector parallelTileSizes; + if (isBMM) { + parallelTileSizes = {1, tileConfig[0], tileConfig[1]}; + } else { + parallelTileSizes = {tileConfig[0], tileConfig[1]}; + } + + // tile DimM and DimN and each tile dispathes to block + Value tiledMatmulOp; + if (hasEpilogue) { + // TODO + } else { + transform::TileUsingForallOp tiledResultOp = + b.create( + /* target */ pdlV, + /* staticTileSizes */ parallelTileSizes, + /* ctor tag */ transform::TileSizesSpec(), + /* mapping */ b.getArrayAttr(mappingAttrs) + ); + + if (linalgFillOp) { + b.create( + /* producerOp */ linalgFillOp, + /* containingOp */ tiledResultOp.getForallOp() + ); + } + tiledMatmulOp = tiledResultOp.getTiledOp(); + } + + // only tile DimK of the matmul which is dispatched to each block + SmallVector reduceTileSize; + if (isBMM) { + reduceTileSize = {0, 0, 0, tileConfig[2]}; + } else { + reduceTileSize = {0, 0, tileConfig[2]}; + } + + auto tiledKMatmulOp = + 
b.create( + /* target */ tiledMatmulOp, + /* staticTileSizes */ reduceTileSize + ); + + // for k in K steps tileConfig[2] + auto forLoops = tiledKMatmulOp.getLoops(); + // tiledmatmul computes at (BM, BN, tileConfig[2]) + auto kMatmulOp = tiledKMatmulOp.getTiledLinalgOp(); + + if (!forLoops.empty()) { + b.create(forLoops[0], getMatmulKMainLoopMarker(), + Value()); + } else { + b.create(kMatmulOp, getMatmulKMainLoopMarker(), + Value()); + } + + // Value mmaLevel = b.create( + // /* type */ transform::ParamType::get(b.getContext(), b.getStringAttr()), + // /* value */ b.getStringAttr("Threadblock") + // ); + + // b.create(kMatmulOp, getLinalgMMALevelAttrName(), + // mmaLevel); + b.create(kMatmulOp, getMMAPatternAttrName(), + Value()); + }; + pm.addPass(createGenericTransformInsertionPass(config)); +} +} // namespace + +void mlir::buddy::createGemmTileTransform(OpPassManager &pm, + const GPUGemmGeneralOptions &options) { + invokeOpPassPipelineBuilder( + createGemmTileTransformImpl, pm, + options.funcAnchor, options.annotatePrefix); } \ No newline at end of file diff --git a/midend/lib/Pipelines/GPU/Utils.cpp b/midend/lib/Pipelines/GPU/Utils.cpp index 7298195995..67a6b61e70 100644 --- a/midend/lib/Pipelines/GPU/Utils.cpp +++ b/midend/lib/Pipelines/GPU/Utils.cpp @@ -3,9 +3,12 @@ #include "mlir/Dialect/Linalg/Utils/Utils.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" #include "Pipelines/GPU/Utils.h" +#include + using namespace mlir; using namespace llvm; @@ -48,5 +51,59 @@ bool isLinalgMatmul(Operation *op) { } } + +std::optional> getGemmTileSize(func::FuncOp funcOp) { + if (funcOp->hasAttr(getGemmTileMConfigAttrName()) && + funcOp->hasAttr(getGemmTileNConfigAttrName()) && + funcOp->hasAttr(getGemmTileKConfigAttrName())) { + auto tileConfigM = funcOp->getAttrOfType( + getGemmTileMConfigAttrName()).getInt(); + auto tileConfigN = funcOp->getAttrOfType( + getGemmTileNConfigAttrName()).getInt(); + auto tileConfigK = funcOp->getAttrOfType( + getGemmTileKConfigAttrName()).getInt(); + + llvm::SmallVector configVec = {tileConfigM, tileConfigN, tileConfigK}; + + return configVec; + } + return std::nullopt; +} + +std::optional> getGemmBlockSize(func::FuncOp funcOp) { + if (funcOp->hasAttr(getGemmBlockXSizeAttrName()) && + funcOp->hasAttr(getGemmBlockYSizeAttrName()) && + funcOp->hasAttr(getGemmBlockZSizeAttrName())) { + auto blockSizeX = funcOp->getAttrOfType( + getGemmBlockXSizeAttrName()).getInt(); + auto blockSizeY = funcOp->getAttrOfType( + getGemmBlockYSizeAttrName()).getInt(); + auto blockSizeZ = funcOp->getAttrOfType( + getGemmBlockZSizeAttrName()).getInt(); + + llvm::SmallVector blockSizeVec = {blockSizeX, blockSizeY, blockSizeZ}; + + return blockSizeVec; + } + return std::nullopt; +} + +std::optional getGemmPipelineStages(func::FuncOp funcOp) { + if (funcOp->hasAttr(getGemmPipelineStageAttrName())) { + auto stages = funcOp->getAttrOfType( + getGemmPipelineStageAttrName()).getInt(); + return stages; + } + return std::nullopt; +} + +void setMarker(Operation *op, StringRef marker) { + op->setAttr(marker, UnitAttr::get(op->getContext())); +} + +bool hasMarker(Operation *op, StringRef marker) { + return op->hasAttrOfType(marker); +} + } // namespace mlir::buddy } // namespace mlir \ No newline at end of file diff --git a/midend/lib/Pipelines/LinalgTensorOpt.cpp b/midend/lib/Pipelines/LinalgTensorOpt.cpp index a0bce01fd3..ff688eb3c1 100644 --- a/midend/lib/Pipelines/LinalgTensorOpt.cpp +++ 
b/midend/lib/Pipelines/LinalgTensorOpt.cpp @@ -13,14 +13,20 @@ namespace { void addGPULinalgOptPasses(OpPassManager &pm) { { // Gemm Codegen Linalg Tensor Opt + // TileSizeConfig of Dim (M) & Dim(N) & Dim(K) -> BM & BN & BK SmallVector tileConfig = {32, 32, 16}; + // blockIdx.x y z SmallVector workGroup = {32, 2, 1}; int64_t stages = 3; - mlir::buddy::GPUGemmCodegenConfigOptions configOption; - configOption.tileConfig = tileConfig; - configOption.workGroup = workGroup; - configOption.stages = stages; - createGPUGemmTileConfigInsertTransform(pm, configOption); + mlir::buddy::GPUGemmCodegenConfigOptions configOptions; + configOptions.tileConfig = tileConfig; + configOptions.workGroup = workGroup; + configOptions.stages = stages; + createGemmTileConfigInsertTransform(pm, configOptions); + pm.addPass(createTransformDialectInterpreter(true)); + + mlir::buddy::GPUGemmGeneralOptions generalOptions; + createGemmTileTransform(pm, generalOptions); pm.addPass(createTransformDialectInterpreter(true)); } } From 63ad6c961ee2f9e97102180af528479d6133c528 Mon Sep 17 00:00:00 2001 From: CJ77Qi Date: Sat, 14 Sep 2024 17:03:51 +0800 Subject: [PATCH 6/7] add linalg-promotion pass --- examples/BuddyMatmul/linalg-matmul.mlir | 18 +- midend/include/Dialect/CMakeLists.txt | 3 +- midend/include/Dialect/GPU/Passes.h | 0 midend/include/Dialect/GPU/Passes.td | 3 + midend/include/Dialect/Linalg/CMakeLists.txt | 3 + midend/include/Dialect/Linalg/Passes.h | 15 ++ midend/include/Dialect/Linalg/Passes.td | 6 + .../Linalg/Transforms/LinalgPromotion.h | 16 ++ midend/include/Pipelines/BufferizeOpt.h | 29 +++ midend/include/Pipelines/LinalgMemrefOpt.h | 29 +++ .../GPU/Utils.h => Utils/GemmCodegenUtils.h} | 68 ++++-- midend/include/Utils/PipelineUtils.h | 34 +++ midend/lib/Dialect/CMakeLists.txt | 1 + midend/lib/Dialect/Linalg/CMakeLists.txt | 1 + .../Dialect/Linalg/Transforms/CMakeLists.txt | 11 + .../Linalg/Transforms/LinalgPromotion.cpp | 205 ++++++++++++++++++ .../Dialect/Linalg/Transforms/PassDetail.h | 13 ++ midend/lib/Pipelines/BufferizeOpt.cpp | 35 +++ midend/lib/Pipelines/CMakeLists.txt | 6 +- midend/lib/Pipelines/GPU/CMakeLists.txt | 2 +- .../Pipelines/GPU/GemmCodegenTransform.cpp | 3 +- midend/lib/Pipelines/LinalgMemrefOpt.cpp | 38 ++++ midend/lib/Pipelines/LinalgTensorOpt.cpp | 24 +- midend/lib/Utils/CMakeLists.txt | 16 ++ .../Utils.cpp => Utils/GemmCodegenUtils.cpp} | 63 +++++- midend/lib/Utils/PipelineUtils.cpp | 12 + tools/buddy-opt/CMakeLists.txt | 1 + tools/buddy-opt/buddy-opt.cpp | 5 +- 28 files changed, 612 insertions(+), 48 deletions(-) create mode 100644 midend/include/Dialect/GPU/Passes.h create mode 100644 midend/include/Dialect/GPU/Passes.td create mode 100644 midend/include/Dialect/Linalg/CMakeLists.txt create mode 100644 midend/include/Dialect/Linalg/Passes.h create mode 100644 midend/include/Dialect/Linalg/Passes.td create mode 100644 midend/include/Dialect/Linalg/Transforms/LinalgPromotion.h create mode 100644 midend/include/Pipelines/BufferizeOpt.h create mode 100644 midend/include/Pipelines/LinalgMemrefOpt.h rename midend/include/{Pipelines/GPU/Utils.h => Utils/GemmCodegenUtils.h} (54%) create mode 100644 midend/include/Utils/PipelineUtils.h create mode 100644 midend/lib/Dialect/Linalg/CMakeLists.txt create mode 100644 midend/lib/Dialect/Linalg/Transforms/CMakeLists.txt create mode 100644 midend/lib/Dialect/Linalg/Transforms/LinalgPromotion.cpp create mode 100644 midend/lib/Dialect/Linalg/Transforms/PassDetail.h create mode 100644 midend/lib/Pipelines/BufferizeOpt.cpp create mode 100644 
midend/lib/Pipelines/LinalgMemrefOpt.cpp rename midend/lib/{Pipelines/GPU/Utils.cpp => Utils/GemmCodegenUtils.cpp} (69%) create mode 100644 midend/lib/Utils/PipelineUtils.cpp diff --git a/examples/BuddyMatmul/linalg-matmul.mlir b/examples/BuddyMatmul/linalg-matmul.mlir index 72a70d48aa..2690acb3b6 100644 --- a/examples/BuddyMatmul/linalg-matmul.mlir +++ b/examples/BuddyMatmul/linalg-matmul.mlir @@ -2,20 +2,20 @@ #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> module attributes {} { - func.func private @Matmul(%arg0: tensor<1280x32xf16>, %arg1: tensor<32x1280xf16>) -> tensor<1280x1280xf16> attributes {} { + func.func private @Matmul(%arg0: tensor<1024x512xf16>, %arg1: tensor<512x1024xf16>) -> tensor<1024x1024xf16> attributes {} { %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1280x1280xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<1280x1280xf16>) -> tensor<1280x1280xf16> - %2 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<1280x32xf16>, tensor<32x1280xf16>) outs(%1 : tensor<1280x1280xf16>) { + %0 = tensor.empty() : tensor<1024x1024xf16> + %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<1024x1024xf16>) -> tensor<1024x1024xf16> + %2 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<1024x512xf16>, tensor<512x1024xf16>) outs(%1 : tensor<1024x1024xf16>) { ^bb0(%in: f16, %in_0: f16, %out: f16): %3 = arith.mulf %in, %in_0 : f16 %4 = arith.addf %out, %3 : f16 linalg.yield %4 : f16 - } -> tensor<1280x1280xf16> - return %2 : tensor<1280x1280xf16> + } -> tensor<1024x1024xf16> + return %2 : tensor<1024x1024xf16> } - func.func @forward(%arg0: tensor<1280x32xf16>, %arg1: tensor<32x1280xf16>) -> tensor<1280x1280xf16> { - %0 = call @Matmul(%arg0, %arg1) : (tensor<1280x32xf16>, tensor<32x1280xf16>) -> tensor<1280x1280xf16> - return %0 : tensor<1280x1280xf16> + func.func @forward(%arg0: tensor<1024x512xf16>, %arg1: tensor<512x1024xf16>) -> tensor<1024x1024xf16> { + %0 = call @Matmul(%arg0, %arg1) : (tensor<1024x512xf16>, tensor<512x1024xf16>) -> tensor<1024x1024xf16> + return %0 : tensor<1024x1024xf16> } } diff --git a/midend/include/Dialect/CMakeLists.txt b/midend/include/Dialect/CMakeLists.txt index 7b472d0d2d..9bae4c7796 100644 --- a/midend/include/Dialect/CMakeLists.txt +++ b/midend/include/Dialect/CMakeLists.txt @@ -5,4 +5,5 @@ add_subdirectory(RVV) add_subdirectory(VectorExp) add_subdirectory(Gemmini) add_subdirectory(Sche) -add_subdirectory(Transform) \ No newline at end of file +add_subdirectory(Transform) +add_subdirectory(Linalg) \ No newline at end of file diff --git a/midend/include/Dialect/GPU/Passes.h b/midend/include/Dialect/GPU/Passes.h new file mode 100644 index 0000000000..e69de29bb2 diff --git a/midend/include/Dialect/GPU/Passes.td b/midend/include/Dialect/GPU/Passes.td new file mode 100644 index 0000000000..b247a5373f --- /dev/null +++ b/midend/include/Dialect/GPU/Passes.td @@ -0,0 +1,3 @@ +include "mlir/Pass/PassBase.td" + +def \ No newline at end of file diff --git a/midend/include/Dialect/Linalg/CMakeLists.txt b/midend/include/Dialect/Linalg/CMakeLists.txt new file mode 100644 index 0000000000..1ef1e9b24e --- /dev/null +++ b/midend/include/Dialect/Linalg/CMakeLists.txt @@ -0,0 +1,3 @@ +set(LLVM_TARGET_DEFINITIONS Passes.td) +mlir_tablegen(Passes.h.inc -gen-pass-decls) +add_public_tablegen_target(BuddyLinalgPassIncGen) \ No newline 
at end of file
diff --git a/midend/include/Dialect/Linalg/Passes.h b/midend/include/Dialect/Linalg/Passes.h
new file mode 100644
index 0000000000..a9982e5404
--- /dev/null
+++ b/midend/include/Dialect/Linalg/Passes.h
@@ -0,0 +1,15 @@
+#ifndef DIALECT_LINALG_PASSES_H
+#define DIALECT_LINALG_PASSES_H
+
+// Include the constructor of passes in Linalg Dialect
+#include "Linalg/Transforms/LinalgPromotion.h"
+
+namespace mlir {
+// Generate the definition of Linalg Passes
+#define GEN_PASS_DECL
+#include "Linalg/Passes.h.inc"
+
+#define GEN_PASS_REGISTRATION
+#include "Linalg/Passes.h.inc"
+
+} // namespace mlir \ No newline at end of file
diff --git a/midend/include/Dialect/Linalg/Passes.td b/midend/include/Dialect/Linalg/Passes.td
new file mode 100644
index 0000000000..d959b63284
--- /dev/null
+++ b/midend/include/Dialect/Linalg/Passes.td
@@ -0,0 +1,6 @@
+include "mlir/Pass/PassBase.td"
+
+def LinalgPromotion : Pass<"linalg-promotion", "func::FuncOp"> {
+  let summary = "Promote linalg matmul operand subviews to shared-memory buffers via memref.alloc and linalg.copy";
+  let constructor = "mlir::createLinalgPromotionPass()";
+} \ No newline at end of file
diff --git a/midend/include/Dialect/Linalg/Transforms/LinalgPromotion.h b/midend/include/Dialect/Linalg/Transforms/LinalgPromotion.h
new file mode 100644
index 0000000000..8405e24bb1
--- /dev/null
+++ b/midend/include/Dialect/Linalg/Transforms/LinalgPromotion.h
@@ -0,0 +1,16 @@
+#ifndef LINALG_TRANSFORMS_LINALGPROMOTION_H
+#define LINALG_TRANSFORMS_LINALGPROMOTION_H
+
+#include "mlir/Pass/Pass.h"
+#include "llvm/ADT/StringRef.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include <memory>
+
+namespace mlir {
+
+std::unique_ptr<OperationPass<func::FuncOp>>
+createLinalgPromotionPass();
+
+} // namespace mlir
+
+#endif \ No newline at end of file
diff --git a/midend/include/Pipelines/BufferizeOpt.h b/midend/include/Pipelines/BufferizeOpt.h
new file mode 100644
index 0000000000..a459863cf8
--- /dev/null
+++ b/midend/include/Pipelines/BufferizeOpt.h
@@ -0,0 +1,29 @@
+#ifndef PIPELINES_BUFFERIZEOPT_H
+#define PIPELINES_BUFFERIZEOPT_H
+
+#include "Pipelines/LinalgTensorOpt.h"
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Pass/PassOptions.h"
+#include "mlir/Pass/PassRegistry.h"
+#include "llvm/Support/CommandLine.h"
+#include <string>
+
+namespace mlir {
+namespace buddy {
+
+struct BuddyBufferizeOptOptions :
+    public PassPipelineOptions<BuddyBufferizeOptOptions> {
+  Option<std::string> target{
+      *this, "target",
+      llvm::cl::desc("An option to specify the target"),
+  };
+};
+
+void createBufferizeOptPipeline(OpPassManager &pm, const BuddyBufferizeOptOptions &options);
+
+void registerBufferizeOptPassPipeline();
+
+} // namespace mlir::buddy
+} // namespace mlir
+
+#endif \ No newline at end of file
diff --git a/midend/include/Pipelines/LinalgMemrefOpt.h b/midend/include/Pipelines/LinalgMemrefOpt.h
new file mode 100644
index 0000000000..b61032efef
--- /dev/null
+++ b/midend/include/Pipelines/LinalgMemrefOpt.h
@@ -0,0 +1,29 @@
+#ifndef PIPELINES_MEMREFOPT_H
+#define PIPELINES_MEMREFOPT_H
+
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Pass/PassOptions.h"
+#include "mlir/Pass/PassRegistry.h"
+#include "llvm/Support/CommandLine.h"
+#include <string>
+
+namespace mlir {
+namespace buddy {
+
+struct LinalgMemrefOptPipelineOptions :
+    public PassPipelineOptions<LinalgMemrefOptPipelineOptions> {
+  Option<std::string> target{
+      *this, "target",
+      llvm::cl::desc("An optional attribute to specify the target."),
+  };
+};
+
+void createLinalgMemrefOptPipeline(OpPassManager &pm,
+                                   const LinalgMemrefOptPipelineOptions &options);
+
+void registerLinalgMemrefOptPipeline();
+
+} // namespace mlir::buddy
+} // namespace mlir
+
+#endif \ No
newline at end of file diff --git a/midend/include/Pipelines/GPU/Utils.h b/midend/include/Utils/GemmCodegenUtils.h similarity index 54% rename from midend/include/Pipelines/GPU/Utils.h rename to midend/include/Utils/GemmCodegenUtils.h index 7821b53c31..69c4acae6c 100644 --- a/midend/include/Pipelines/GPU/Utils.h +++ b/midend/include/Utils/GemmCodegenUtils.h @@ -1,8 +1,12 @@ #ifndef PIPELINES_GPU_UTILS_H #define PIPELINES_GPU_UTILS_H +#include "mlir/IR/OperationSupport.h" #include "mlir/Pass/PassManager.h" #include "mlir/Pass/PassRegistry.h" +#include "llvm/ADT/StringRef.h" +#include +#include namespace mlir { class ModuleOp; @@ -13,25 +17,15 @@ class FuncOp; namespace buddy { -template -void invokeOpPassPipelineBuilder(Builder builder, OpPassManager &pm, - Args &&...args) { - if (pm.getOpAnchorName() != OpPassManager::getAnyOpAnchorName() && - pm.getOpAnchorName() != OpClass::getOperationName()) { - if (pm.getNesting() == OpPassManager::Nesting::Implicit) { - builder(pm.nest(), std::forward(args)...); - return; - } - llvm::report_fatal_error( - llvm::Twine("Can't build pass pipeline on expected op type ") + - OpClass::getOperationName() + " but got " + pm.getOpAnchorName()); - } else { - builder(pm, std::forward(args)...); - } -} - bool isLinalgMatmul(Operation *op); +void setMarker(mlir::Operation *op, llvm::StringRef marker); + +bool hasMarker(Operation *op, StringRef marker); + +static constexpr StringRef getGemmMarkerAttrName() { + return "__buddy_gemm__"; +} static constexpr StringRef getGemmTileMConfigAttrName() { return "__buddy_gemm_tile_config__M"; @@ -65,12 +59,37 @@ static constexpr StringRef getMatmulKMainLoopMarker() { return "__buddy_gemm_main_loopk__"; } -constexpr StringRef getLinalgMMALevelAttrName() { - return "__buddy_mma_level__"; +static constexpr StringRef getLinalgMMALevelAttrName() { + return "__buddy_mma_level__"; } -constexpr StringRef getMMAPatternAttrName() { return "__buddy_mma__"; } +static constexpr StringRef getMMAPatternAttrName() { + return "__buddy_mma__"; +} +static constexpr StringRef getAllocSharedMemoryAMarker() { + return "__buddy_smem_matrix_a__"; +}; + +static constexpr StringRef getAllocSharedMemoryBMarker() { + return "__buddy_smem_matrix_b__"; +}; + +static constexpr StringRef getAllocSharedMemoryAccMarker() { + return "__buddy_smem_accumulator__"; +}; + +static constexpr StringRef getCopyToSharedMemoryAMarker() { + return "__buddy_load_matrix_a__"; +}; + +static constexpr StringRef getCopyToSharedMemoryBMarker() { + return "__buddy_load_matrix_b__"; +}; + +static constexpr StringRef getCopyFromSharedMemoryAccMarker() { + return "__buddy_store_matrix_c__"; +}; std::optional> getGemmTileSize(func::FuncOp funcOp); @@ -78,12 +97,13 @@ std::optional> getGemmBlockSize(func::FuncOp funcOp); std::optional getGemmPipelineStages(func::FuncOp funcOp); -void setMarker(mlir::Operation *op, llvm::StringRef marker); +bool funcHasGemm(func::FuncOp funcOp); -bool hasMarker(Operation *op, StringRef marker); +bool isMappedToGPUBlock(scf::ForallOp forallOp); +std::optional getForallOpMappedToBlock(func::FuncOp funcOp); -} // namespace buddy::pipelines -} // namespace buddy +} // namespace mlir::buddy +} // namespace mlir #endif \ No newline at end of file diff --git a/midend/include/Utils/PipelineUtils.h b/midend/include/Utils/PipelineUtils.h new file mode 100644 index 0000000000..215810c40c --- /dev/null +++ b/midend/include/Utils/PipelineUtils.h @@ -0,0 +1,34 @@ +#ifndef UTILS_PIPELINEUTILS_H +#define UTILS_PIPELINEUTILS_H + +#include "mlir/IR/BuiltinOps.h" 
+#include "mlir/Pass/PassManager.h" +#include "mlir/Pass/PassRegistry.h" + +namespace mlir { +class ModuleOp; +namespace buddy { + +template +void invokeOpPassPipelineBuilder(Builder builder, OpPassManager &pm, + Args &&...args) { + if (pm.getOpAnchorName() != OpPassManager::getAnyOpAnchorName() && + pm.getOpAnchorName() != OpClass::getOperationName()) { + if (pm.getNesting() == OpPassManager::Nesting::Implicit) { + builder(pm.nest(), std::forward(args)...); + return; + } + llvm::report_fatal_error( + llvm::Twine("Can't build pass pipeline on expected op type ") + + OpClass::getOperationName() + " but got " + pm.getOpAnchorName()); + } else { + builder(pm, std::forward(args)...); + } +} + +void addCleanUpPassPipeline(OpPassManager &pm, bool isModuleOp = true); + +} // namespace mlir::buddy +} // namespace mlir + +#endif \ No newline at end of file diff --git a/midend/lib/Dialect/CMakeLists.txt b/midend/lib/Dialect/CMakeLists.txt index 304de8570c..f0e2ac6c13 100644 --- a/midend/lib/Dialect/CMakeLists.txt +++ b/midend/lib/Dialect/CMakeLists.txt @@ -6,3 +6,4 @@ add_subdirectory(VectorExp) add_subdirectory(Gemmini) add_subdirectory(Sche) add_subdirectory(Transform) +add_subdirectory(Linalg) diff --git a/midend/lib/Dialect/Linalg/CMakeLists.txt b/midend/lib/Dialect/Linalg/CMakeLists.txt new file mode 100644 index 0000000000..5c919f7dfc --- /dev/null +++ b/midend/lib/Dialect/Linalg/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(Transforms) \ No newline at end of file diff --git a/midend/lib/Dialect/Linalg/Transforms/CMakeLists.txt b/midend/lib/Dialect/Linalg/Transforms/CMakeLists.txt new file mode 100644 index 0000000000..8a69b82a09 --- /dev/null +++ b/midend/lib/Dialect/Linalg/Transforms/CMakeLists.txt @@ -0,0 +1,11 @@ +add_mlir_dialect_library(BuddyLinalgPasses + LinalgPromotion.cpp + + DEPENDS + BuddyLinalgPassIncGen + + LINK_LIBS PUBLIC + MLIRIR + MLIRPass + MLIRTransformDialect +) diff --git a/midend/lib/Dialect/Linalg/Transforms/LinalgPromotion.cpp b/midend/lib/Dialect/Linalg/Transforms/LinalgPromotion.cpp new file mode 100644 index 0000000000..31f4225e43 --- /dev/null +++ b/midend/lib/Dialect/Linalg/Transforms/LinalgPromotion.cpp @@ -0,0 +1,205 @@ +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/Linalg/Transforms/Transforms.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/Utils/StaticValueUtils.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinAttributeInterfaces.h" +#include "mlir/IR/IRMapping.h" +#include "mlir/IR/Matchers.h" +#include "mlir/IR/OperationSupport.h" +#include "mlir/Support/LLVM.h" + +#include "Linalg/Transforms/LinalgPromotion.h" +#include "PassDetail.h" +#include "Utils/GemmCodegenUtils.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace mlir; +using namespace mlir::buddy; + +namespace { + +constexpr int64_t MatmulOperand_A = 0; +constexpr int64_t MatmulOperand_B = 1; +constexpr int64_t MatmulOperand_C = 2; + +constexpr StringRef allocMarker[3] = {getAllocSharedMemoryAMarker(), + getAllocSharedMemoryBMarker(), + getAllocSharedMemoryAccMarker()}; + +constexpr StringRef copyMarker[3] = {getCopyToSharedMemoryAMarker(), + getCopyToSharedMemoryBMarker(), + getCopyFromSharedMemoryAccMarker()}; + +template +std::optional allocateWorkgroupMemory(OpBuilder &builder, memref::SubViewOp subview, + ArrayRef boundingSubViewSize, DataLayout &layout) { + + OpBuilder::InsertionGuard guard(builder); + + 
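+  // Shared-memory promotion only applies inside the scf.forall that was
+  // mapped to GPU blocks; the new buffer is hoisted to the top of its body.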
scf::ForallOp forallOp = subview->getParentOfType(); + if (!forallOp) { + return std::nullopt; + } + + SmallVector shapes; + for (Value bound : boundingSubViewSize) { + APInt value; + if (!matchPattern(bound, m_ConstantInt(&value))) { + return std::nullopt; + } + shapes.push_back(value.getSExtValue()); + } + + builder.setInsertionPointToStart(forallOp.getBody()); + + auto smemBufferType = MemRefType::get(shapes, subview.getType().getElementType(), + MemRefLayoutAttrInterface{}, + gpu::AddressSpaceAttr::get(builder.getContext(), + gpu::GPUDialect::getWorkgroupAddressSpace())); + + memref::AllocOp smemBuffer = builder.create(forallOp.getLoc(), smemBufferType); + setMarker(smemBuffer, allocMarker[OPERAND]); + + return smemBuffer; + +} + +// nvgpu device mem no need to manually deallocate +LogicalResult deallocateWorkgroupMemory(OpBuilder &, Value /*buffer*/) { + return success(); +} + +template +LogicalResult copyGlobalMemoryToWorkgroupMemory(OpBuilder &b, Value src, + Value dst) { + if (OPERAND == MatmulOperand_C) { + return success(); + } + auto copyOp = b.create(src.getLoc(), src, dst); + setMarker(copyOp, copyMarker[OPERAND]); + return success(); +} + +template +LogicalResult copyWorkgroupMemoryToGlobalMemory(OpBuilder &b, Value src, + Value dst) { + OpBuilder::InsertionGuard guard(b); + if (OPERAND == MatmulOperand_A || + OPERAND == MatmulOperand_B) { + return success(); + } + auto op = src.getDefiningOp(); + + // Because MatmulC Operand is out of scf::ForOp so we need to get ForallOp first + scf::ForallOp forallOp = op->getParentOfType(); + auto forOps = llvm::to_vector(forallOp.getOps()); + if (forOps.size() == 1) { + // set copymem op insertion point after compute block + b.setInsertionPointAfter(forOps[0]); + } + if (forOps.size() > 1) { + return failure(); + } + b.create(src.getLoc()); + linalg::CopyOp copyOp = b.create(src.getLoc(), src, dst); + setMarker(copyOp, copyMarker[MatmulOperand_C]); + return success(); +} + + +template +static linalg::LinalgPromotionOptions getPromotionOptionsForMatmulOperand() { + linalg::LinalgPromotionOptions promotionOptions; + promotionOptions + .setAllocationDeallocationFns(allocateWorkgroupMemory, + deallocateWorkgroupMemory) + .setCopyInOutFns(copyGlobalMemoryToWorkgroupMemory, + copyWorkgroupMemoryToGlobalMemory) + .setOperandsToPromote({OPERAND}) + .setUseFullTileBuffers({false, false}); + return promotionOptions; +} + +template +static LogicalResult promotionImpl(OpBuilder &builder, Operation *op) { + linalg::LinalgPromotionOptions promotionOptions = + getPromotionOptionsForMatmulOperand(); + + if (failed(linalg::promoteSubviewsPrecondition(op, promotionOptions))) { + return failure(); + } + + std::optional promotedLinalgOp = + linalg::promoteSubViews(builder, cast(op), promotionOptions); + + if (!promotedLinalgOp) { + return op->emitError("subview promotion failed!"); + } + return success(); +} + +struct LinalgPromotionPass : public LinalgPromotionBase { +public: + LinalgPromotionPass() = default; + + void runOnOperation() override { + // the whole promotion pipeline is + // split m, n, promote C, split k, promote A & B + func::FuncOp funcOp = getOperation(); + SmallVector LinalgOpsToPromote; + + if (!funcHasGemm(funcOp)) { + return; + } + + if (!getForallOpMappedToBlock(funcOp)) { + return; + } + scf::ForallOp forallOp = getForallOpMappedToBlock(funcOp).value(); + + forallOp->walk([&](linalg::LinalgOp linalgOp) { + if (isLinalgMatmul(linalgOp)) { + LinalgOpsToPromote.push_back(linalgOp); + } + }); + if (LinalgOpsToPromote.empty()) { + return; 
+ } + assert(LinalgOpsToPromote.size() == 1); + auto linalgContractOp = LinalgOpsToPromote[0]; + + // set Builder insertion point before the linalgContractOp + OpBuilder b(linalgContractOp); + promotionImpl(b, linalgContractOp); + promotionImpl(b, linalgContractOp); + + // set the insertion before forop to alloc MatrixC + scf::ForOp forOp = linalgContractOp->getParentOfType(); + if (!forOp) { + b.setInsertionPoint(linalgContractOp); + } else { + b.setInsertionPoint(forOp); + } + + promotionImpl(b, linalgContractOp); + + + } +}; +} // namespace mlir + +std::unique_ptr> +mlir::createLinalgPromotionPass() { + return std::make_unique(); +} diff --git a/midend/lib/Dialect/Linalg/Transforms/PassDetail.h b/midend/lib/Dialect/Linalg/Transforms/PassDetail.h new file mode 100644 index 0000000000..9b5555267d --- /dev/null +++ b/midend/lib/Dialect/Linalg/Transforms/PassDetail.h @@ -0,0 +1,13 @@ +#ifndef DIALECT_LINALG_TRANSFORMS_PASSDETAIL_H +#define DIALECT_LINALG_TRANSFORMS_PASSDETAIL_H + +#include "mlir/Pass/Pass.h" + +namespace mlir { + +#define GEN_PASS_CLASSES +#include "Linalg/Passes.h.inc" + +} // namespace mlir + +#endif \ No newline at end of file diff --git a/midend/lib/Pipelines/BufferizeOpt.cpp b/midend/lib/Pipelines/BufferizeOpt.cpp new file mode 100644 index 0000000000..b7abdb3680 --- /dev/null +++ b/midend/lib/Pipelines/BufferizeOpt.cpp @@ -0,0 +1,35 @@ +#include "Pipelines/BufferizeOpt.h" +#include "mlir/Dialect/Bufferization/Transforms/OneShotModuleBufferize.h" +#include "mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h" +#include "mlir/Dialect/Bufferization/Transforms/Passes.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/MemRef/Transforms/Passes.h" +#include "mlir/Pass/PassManager.h" +#include "mlir/Pass/PassRegistry.h" +#include "Utils/GemmCodegenUtils.h" +#include "Utils/PipelineUtils.h" + +using namespace mlir; + +void mlir::buddy::createBufferizeOptPipeline(OpPassManager &pm, + const BuddyBufferizeOptOptions &options) { + mlir::buddy::invokeOpPassPipelineBuilder( + [&](OpPassManager &pm) { + // OneShotBufferization not implement bufferize on funcOp's arguments on default + bufferization::OneShotBufferizationOptions bufferizeOptions; + bufferizeOptions.bufferizeFunctionBoundaries = true; + // bufferizeOptions.allowReturnAllocsFromLoops + pm.addNestedPass(bufferization::createEmptyTensorEliminationPass()); + pm.addPass(bufferization::createOneShotBufferizePass(bufferizeOptions)); + pm.addNestedPass(memref::createFoldMemRefAliasOpsPass()); + addCleanUpPassPipeline(pm); + }, pm); +} + +void mlir::buddy::registerBufferizeOptPassPipeline() { + PassPipelineRegistration( + "bufferize-opt", + "bufferize opt lowering tensor to memref", + createBufferizeOptPipeline + ); +} \ No newline at end of file diff --git a/midend/lib/Pipelines/CMakeLists.txt b/midend/lib/Pipelines/CMakeLists.txt index ed97f0e647..59cd289a1c 100644 --- a/midend/lib/Pipelines/CMakeLists.txt +++ b/midend/lib/Pipelines/CMakeLists.txt @@ -2,9 +2,13 @@ add_subdirectory(GPU) add_mlir_library(BuddyPipelines LinalgTensorOpt.cpp - GPU/Utils.cpp + BufferizeOpt.cpp + LinalgMemrefOpt.cpp + LINK_LIBS PUBLIC MLIRIR BuddyGPUPipelines BuddyTransformPasses + BuddyGemmCodegenUtils + BuddyPipelineUtils ) \ No newline at end of file diff --git a/midend/lib/Pipelines/GPU/CMakeLists.txt b/midend/lib/Pipelines/GPU/CMakeLists.txt index 1f5a151312..5ca25dd744 100644 --- a/midend/lib/Pipelines/GPU/CMakeLists.txt +++ b/midend/lib/Pipelines/GPU/CMakeLists.txt @@ -1,6 +1,5 @@ add_mlir_library(BuddyGPUPipelines 
 GemmCodegenTransform.cpp
-  Utils.cpp

   LINK_LIBS PUBLIC
   MLIRIR
@@ -8,4 +7,5 @@ add_mlir_library(BuddyGPUPipelines
   MLIRTransformDialect
   MLIRTransforms
   BuddyTransformPasses
+  BuddyGemmCodegenUtils
 ) \ No newline at end of file
diff --git a/midend/lib/Pipelines/GPU/GemmCodegenTransform.cpp b/midend/lib/Pipelines/GPU/GemmCodegenTransform.cpp
index 0b3c8f3e4f..b4e599da09 100644
--- a/midend/lib/Pipelines/GPU/GemmCodegenTransform.cpp
+++ b/midend/lib/Pipelines/GPU/GemmCodegenTransform.cpp
@@ -1,6 +1,7 @@
 #include "Transform/Transforms/TransformInsertion.h"
 #include "Pipelines/GPU/GemmCodegenTransform.h"
-#include "Pipelines/GPU/Utils.h"
+#include "Utils/GemmCodegenUtils.h"
+#include "Utils/PipelineUtils.h"

 #include "mlir/Dialect/Transform/IR/TransformOps.h"
 #include "mlir/Dialect/PDL/IR/PDLOps.h"
diff --git a/midend/lib/Pipelines/LinalgMemrefOpt.cpp b/midend/lib/Pipelines/LinalgMemrefOpt.cpp
new file mode 100644
index 0000000000..a7d5538162
--- /dev/null
+++ b/midend/lib/Pipelines/LinalgMemrefOpt.cpp
@@ -0,0 +1,38 @@
+#include "Pipelines/LinalgMemrefOpt.h"
+#include "Linalg/Transforms/LinalgPromotion.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Pass/PassRegistry.h"
+#include "Utils/PipelineUtils.h"
+#include "mlir/Transforms/Passes.h"
+#include <string>
+
+using namespace mlir;
+
+namespace {
+
+void addGemmLinalgMemrefOptPipeline(OpPassManager &pm) {
+  // TODO: use funcAnchor to nest into the specific matmul func.
+  pm.addNestedPass<func::FuncOp>(createLinalgPromotionPass());
+  pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
+  pm.addNestedPass<func::FuncOp>(createCSEPass());
+  pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
+}
+
+void createLinalgMemrefOptPipelineImpl(OpPassManager &pm,
+                                       const std::string &target) {
+  addGemmLinalgMemrefOptPipeline(pm);
+}
+
+} // namespace
+
+void mlir::buddy::createLinalgMemrefOptPipeline(OpPassManager &pm,
+    const LinalgMemrefOptPipelineOptions &options) {
+  invokeOpPassPipelineBuilder(createLinalgMemrefOptPipelineImpl, pm, options.target);
+}
+
+void mlir::buddy::registerLinalgMemrefOptPipeline() {
+  PassPipelineRegistration<LinalgMemrefOptPipelineOptions>(
+      "linalg-memref-opt", "Linalg Opt Pipeline with Memref",
+      createLinalgMemrefOptPipeline);
+} \ No newline at end of file
diff --git a/midend/lib/Pipelines/LinalgTensorOpt.cpp b/midend/lib/Pipelines/LinalgTensorOpt.cpp
index ff688eb3c1..8ee373fda2 100644
--- a/midend/lib/Pipelines/LinalgTensorOpt.cpp
+++ b/midend/lib/Pipelines/LinalgTensorOpt.cpp
@@ -1,11 +1,15 @@
 #include "Pipelines/LinalgTensorOpt.h"
-#include "Pipelines/GPU/Utils.h"
+#include "Utils/GemmCodegenUtils.h"
+#include "Utils/PipelineUtils.h"
 #include "Pipelines/GPU/GemmCodegenTransform.h"

 #include "Transform/Transforms/TransformDialectInterpreter.h"
 #include "Transform/Transforms/TransformInsertion.h"

+#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/Linalg/Passes.h"
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Transforms/Passes.h"

 using namespace mlir;

@@ -13,10 +17,16 @@ namespace {

 void addGPULinalgOptPasses(OpPassManager &pm) {
   { // Gemm Codegen Linalg Tensor Opt
+    // TODO: anchor only on funcs that actually contain a GEMM linalg op.
+    // For now the option's funcAnchor is left empty, which treats every
+    // func as if it contained a matmul.
+    auto funcGemmAnchor = mlir::buddy::getGemmMarkerAttrName().str();
     // TileSizeConfig of Dim (M) & Dim(N) & Dim(K) -> BM & BN & BK
-    SmallVector<int64_t> tileConfig = {32, 32, 16};
-    // blockIdx.x y z
-    SmallVector<int64_t> workGroup = {32, 2, 1};
+    // blockIdx.y = M / BM
+    // blockIdx.x = N / BN
+    SmallVector<int64_t> tileConfig = {128, 128, 64};
+    // threadIdx.x y z
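+    // 32 x 4 x 1 = 128 threads per block, i.e. four warps of 32 threads.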
+ SmallVector workGroup = {32, 4, 1}; int64_t stages = 3; mlir::buddy::GPUGemmCodegenConfigOptions configOptions; configOptions.tileConfig = tileConfig; @@ -28,6 +38,8 @@ void addGPULinalgOptPasses(OpPassManager &pm) { mlir::buddy::GPUGemmGeneralOptions generalOptions; createGemmTileTransform(pm, generalOptions); pm.addPass(createTransformDialectInterpreter(true)); + pm.addPass(createCanonicalizerPass()); + pm.addPass(createCSEPass()); } } @@ -50,5 +62,7 @@ void mlir::buddy::createLinalgTensorOptPassPipeline(OpPassManager &pm, void mlir::buddy::registerLinalgTensorOptPassPipeline() { PassPipelineRegistration( - "linalg-tensor-opt", "Linalg with Tensor Opt Pass Pipeline", mlir::buddy::createLinalgTensorOptPassPipeline); + "linalg-tensor-opt", + "Linalg with Tensor Opt Pass Pipeline", + mlir::buddy::createLinalgTensorOptPassPipeline); } diff --git a/midend/lib/Utils/CMakeLists.txt b/midend/lib/Utils/CMakeLists.txt index ff9aa6e380..f9101f1b64 100644 --- a/midend/lib/Utils/CMakeLists.txt +++ b/midend/lib/Utils/CMakeLists.txt @@ -3,6 +3,8 @@ add_mlir_library(BuddyUtils DIPUtils.cpp DAPUtils.cpp AffineTransformUtils.cpp + GemmCodegenUtils.cpp + PipelineUtils.cpp ) add_mlir_library(BuddyDIPUtils @@ -18,3 +20,17 @@ add_mlir_library(BuddyDAPUtils LINK_LIBS PUBLIC BuddyUtils ) + +add_mlir_library(BuddyGemmCodegenUtils + GemmCodegenUtils.cpp + + LINK_LIBS PUBLIC + BuddyUtils +) + +add_mlir_library(BuddyPipelineUtils + PipelineUtils.cpp + + LINK_LIBS PUBLIC + BuddyUtils +) \ No newline at end of file diff --git a/midend/lib/Pipelines/GPU/Utils.cpp b/midend/lib/Utils/GemmCodegenUtils.cpp similarity index 69% rename from midend/lib/Pipelines/GPU/Utils.cpp rename to midend/lib/Utils/GemmCodegenUtils.cpp index 67a6b61e70..8b087846d3 100644 --- a/midend/lib/Pipelines/GPU/Utils.cpp +++ b/midend/lib/Utils/GemmCodegenUtils.cpp @@ -2,10 +2,17 @@ #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/Linalg/Utils/Utils.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" -#include "Pipelines/GPU/Utils.h" +#include "Utils/GemmCodegenUtils.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/MLIRContext.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/raw_ostream.h" #include @@ -15,6 +22,14 @@ using namespace llvm; namespace mlir { namespace buddy { +void setMarker(Operation *op, StringRef marker) { + op->setAttr(marker, UnitAttr::get(op->getContext())); +} + +bool hasMarker(Operation *op, StringRef marker) { + return op->hasAttrOfType(marker); +} + bool isLinalgMatmul(Operation *op) { if (!llvm::isa(op)) { return false; @@ -97,12 +112,50 @@ std::optional getGemmPipelineStages(func::FuncOp funcOp) { return std::nullopt; } -void setMarker(Operation *op, StringRef marker) { - op->setAttr(marker, UnitAttr::get(op->getContext())); +bool funcHasGemm(func::FuncOp funcOp) { + // TODO + return true; } -bool hasMarker(Operation *op, StringRef marker) { - return op->hasAttrOfType(marker); +bool isMappedToGPUBlock(scf::ForallOp forallOp) { + + SmallVector mappingIdx{2, 1, 0}; + MLIRContext ctx; + ctx.loadDialect(); + auto mapping = llvm::to_vector(llvm::map_range( + mappingIdx, + [](int64_t i){return static_cast(i);})); + auto mappingAttrs = llvm::to_vector(llvm::map_range( + mapping, + [&](gpu::MappingId dim) -> Attribute { + return gpu::GPUBlockMappingAttr::get(&ctx, dim);})); + ArrayAttr 
getMappingAttrs = forallOp->getAttrOfType("mapping"); + + if (!getMappingAttrs) { + return false; + } else { + for (auto mappingAttr : getMappingAttrs) { + if (mappingAttr.isa()) { + return true; + } + } + return false; + } +} + +std::optional getForallOpMappedToBlock(func::FuncOp funcOp) { + SmallVector forallOps; + funcOp->walk([&](scf::ForallOp forallOp){ + if (isMappedToGPUBlock(forallOp)) { + forallOps.push_back(forallOp); + } + }); + // one func one kernel -> one func only have one forallOp mapped to block + if (forallOps.size() != 1) { + llvm::errs() << "this funcOp has no ForallOp MappedToBlock\n"; + return std::nullopt; + } + return forallOps[0]; } } // namespace mlir::buddy diff --git a/midend/lib/Utils/PipelineUtils.cpp b/midend/lib/Utils/PipelineUtils.cpp new file mode 100644 index 0000000000..34c4600e38 --- /dev/null +++ b/midend/lib/Utils/PipelineUtils.cpp @@ -0,0 +1,12 @@ +#include "Utils/PipelineUtils.h" +#include "mlir/Transforms/Passes.h" + +using namespace mlir; + +void mlir::buddy::addCleanUpPassPipeline(OpPassManager &pm, bool isModuleOp) { + pm.addPass(createCSEPass()); + pm.addPass(createCanonicalizerPass()); + if (isModuleOp) { + pm.addPass(createSymbolDCEPass()); + } +} \ No newline at end of file diff --git a/tools/buddy-opt/CMakeLists.txt b/tools/buddy-opt/CMakeLists.txt index 7a87da7f24..1b448824d3 100644 --- a/tools/buddy-opt/CMakeLists.txt +++ b/tools/buddy-opt/CMakeLists.txt @@ -40,4 +40,5 @@ target_link_libraries(buddy-opt BuddyPipelines BuddyGPUPipelines BuddyTransformPasses + BuddyLinalgPasses ) diff --git a/tools/buddy-opt/buddy-opt.cpp b/tools/buddy-opt/buddy-opt.cpp index fc96aa8bae..178532da11 100644 --- a/tools/buddy-opt/buddy-opt.cpp +++ b/tools/buddy-opt/buddy-opt.cpp @@ -47,7 +47,6 @@ #include "Gemmini/GemminiOps.h" #include "Sche/ScheDialect.h" #include "Sche/ScheOps.h" -#include "Pipelines/LinalgTensorOpt.h" namespace mlir { namespace buddy { @@ -73,6 +72,8 @@ void registerDeviceSchedulePass(); void registerLowerSchePass(); void registerFuncBufferizeDynamicOffsetPass(); void registerLinalgTensorOptPassPipeline(); +void registerBufferizeOptPassPipeline(); +void registerLinalgMemrefOptPipeline(); } // namespace buddy } // namespace mlir @@ -108,6 +109,8 @@ int main(int argc, char **argv) { // Register Pipeline Passes mlir::buddy::registerLinalgTensorOptPassPipeline(); + mlir::buddy::registerLinalgMemrefOptPipeline(); + mlir::buddy::registerBufferizeOptPassPipeline(); mlir::DialectRegistry registry; // Register all MLIR core dialects. 
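With the registrations above in place, a minimal end-to-end invocation looks roughly like the following. The pipeline names come from the PassPipelineRegistration calls in these patches; the exact "target=gpu" option string is an assumption based on the target Option declared for the pipeline options.

    buddy-opt examples/BuddyMatmul/linalg-matmul.mlir \
        -linalg-tensor-opt="target=gpu" \
        -bufferize-opt \
        -linalg-memref-opt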
From 46244c0e2bf6cdca92df4398cbb02bbe080a0652 Mon Sep 17 00:00:00 2001 From: CJ77Qi Date: Fri, 20 Sep 2024 15:34:46 +0800 Subject: [PATCH 7/7] add GPUDistributeToWarpPass & RemoveRedundantPass in GPU GEMM Codegen Pipeline --- midend/include/Dialect/CMakeLists.txt | 3 +- midend/include/Dialect/GPU/CMakeLists.txt | 3 + midend/include/Dialect/GPU/Passes.h | 18 + midend/include/Dialect/GPU/Passes.td | 10 +- .../GPU/Transforms/GPUDistributeToWarp.h | 15 + .../GPU/Transforms/RemoveReduntantLoops.h | 16 + midend/include/Dialect/Linalg/Passes.h | 4 +- .../Linalg/Transforms/LinalgPromotion.h | 4 +- midend/include/Dialect/Transform/Passes.h | 7 +- midend/lib/Dialect/CMakeLists.txt | 1 + midend/lib/Dialect/GPU/CMakeLists.txt | 1 + .../lib/Dialect/GPU/Transforms/CMakeLists.txt | 11 + .../GPU/Transforms/GPUDistributeToWarp.cpp | 233 +++++++++++++ .../lib/Dialect/GPU/Transforms/PassDetail.h | 13 + .../GPU/Transforms/RemoveReduntantLoops.cpp | 313 ++++++++++++++++++ .../Linalg/Transforms/LinalgPromotion.cpp | 7 +- midend/lib/Pipelines/CMakeLists.txt | 1 + midend/lib/Pipelines/LinalgMemrefOpt.cpp | 4 + 18 files changed, 656 insertions(+), 8 deletions(-) create mode 100644 midend/include/Dialect/GPU/CMakeLists.txt create mode 100644 midend/include/Dialect/GPU/Transforms/GPUDistributeToWarp.h create mode 100644 midend/include/Dialect/GPU/Transforms/RemoveReduntantLoops.h create mode 100644 midend/lib/Dialect/GPU/Transforms/CMakeLists.txt create mode 100644 midend/lib/Dialect/GPU/Transforms/GPUDistributeToWarp.cpp create mode 100644 midend/lib/Dialect/GPU/Transforms/PassDetail.h create mode 100644 midend/lib/Dialect/GPU/Transforms/RemoveReduntantLoops.cpp diff --git a/midend/include/Dialect/CMakeLists.txt b/midend/include/Dialect/CMakeLists.txt index 9bae4c7796..2f98ccaba0 100644 --- a/midend/include/Dialect/CMakeLists.txt +++ b/midend/include/Dialect/CMakeLists.txt @@ -6,4 +6,5 @@ add_subdirectory(VectorExp) add_subdirectory(Gemmini) add_subdirectory(Sche) add_subdirectory(Transform) -add_subdirectory(Linalg) \ No newline at end of file +add_subdirectory(Linalg) +add_subdirectory(GPU) \ No newline at end of file diff --git a/midend/include/Dialect/GPU/CMakeLists.txt b/midend/include/Dialect/GPU/CMakeLists.txt new file mode 100644 index 0000000000..d268af3b42 --- /dev/null +++ b/midend/include/Dialect/GPU/CMakeLists.txt @@ -0,0 +1,3 @@ +set(LLVM_TARGET_DEFINITIONS Passes.td) +mlir_tablegen(Passes.h.inc -gen-pass-decls) +add_public_tablegen_target(BuddyGPUPassIncGen) \ No newline at end of file diff --git a/midend/include/Dialect/GPU/Passes.h b/midend/include/Dialect/GPU/Passes.h index e69de29bb2..f7dcca88e6 100644 --- a/midend/include/Dialect/GPU/Passes.h +++ b/midend/include/Dialect/GPU/Passes.h @@ -0,0 +1,18 @@ +#ifndef DIALECT_GPU_PASSES_H +#define DIALECT_GPU_PASSES_H + +// Include the constructor of passes in GPU Dialect +#include "GPU/Transforms/GPUDistributeToWarp.h" +#include "GPU/Transforms/RemoveReduntantLoops.h" + +namespace mlir { +// Generate the definition of GPU Passes +#define GEN_PASS_DECL +#include "GPU/Passes.h.inc" + +#define GEN_PASS_REGISTRATION +#include "GPU/Passes.h.inc" + +} // namespace mlir + +#endif \ No newline at end of file diff --git a/midend/include/Dialect/GPU/Passes.td b/midend/include/Dialect/GPU/Passes.td index b247a5373f..5c85dcbfd5 100644 --- a/midend/include/Dialect/GPU/Passes.td +++ b/midend/include/Dialect/GPU/Passes.td @@ -1,3 +1,11 @@ include "mlir/Pass/PassBase.td" -def \ No newline at end of file +def GPUDistributeToWarp : Pass<"gpu-distribute-to-warp", 
"func::FuncOp"> { + let summary = "GPU distribute from Block level to Warp"; + let constructor = "mlir::createGPUDistributeToWarpPass()"; +} + +def RemoveReduntantLoops : Pass<"remove-reduntant-loops", "func::FuncOp"> { + let summary = "Remove the loops that only run once in gpu kernel"; + let constructor = "mlir::createRemoveReduntantLoops()"; +} \ No newline at end of file diff --git a/midend/include/Dialect/GPU/Transforms/GPUDistributeToWarp.h b/midend/include/Dialect/GPU/Transforms/GPUDistributeToWarp.h new file mode 100644 index 0000000000..c48622d822 --- /dev/null +++ b/midend/include/Dialect/GPU/Transforms/GPUDistributeToWarp.h @@ -0,0 +1,15 @@ +#ifndef DIALECT_GPU_TRANSFORMS_GPUDistributeToWarp_H +#define DIALECT_GPU_TRANSFORMS_GPUDistributeToWarp_H + +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Pass/Pass.h" +#include "llvm/ADT/StringRef.h" +#include + +namespace mlir { + +std::unique_ptr> createGPUDistributeToWarpPass(); + +} // namespace mlir + +#endif \ No newline at end of file diff --git a/midend/include/Dialect/GPU/Transforms/RemoveReduntantLoops.h b/midend/include/Dialect/GPU/Transforms/RemoveReduntantLoops.h new file mode 100644 index 0000000000..7489038db8 --- /dev/null +++ b/midend/include/Dialect/GPU/Transforms/RemoveReduntantLoops.h @@ -0,0 +1,16 @@ +#ifndef DIALECT_GPU_TRANSFORMS_REMOVEREDUNTANTLOOPS_H +#define DIALECT_GPU_TRANSFORMS_REMOVEREDUNTANTLOOPS_H + +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/IR/Operation.h" +#include "mlir/Pass/Pass.h" +#include "llvm/ADT/StringRef.h" +#include + +namespace mlir { + +std::unique_ptr> createRemoveReduntantLoops(); + +} // namespace mlir + +#endif \ No newline at end of file diff --git a/midend/include/Dialect/Linalg/Passes.h b/midend/include/Dialect/Linalg/Passes.h index a9982e5404..d4136bde9a 100644 --- a/midend/include/Dialect/Linalg/Passes.h +++ b/midend/include/Dialect/Linalg/Passes.h @@ -12,4 +12,6 @@ namespace mlir { #define GEN_PASS_REGISTRATION #include "Linalg/Passes.h.inc" -} // namespace mlir \ No newline at end of file +} // namespace mlir + +#endif \ No newline at end of file diff --git a/midend/include/Dialect/Linalg/Transforms/LinalgPromotion.h b/midend/include/Dialect/Linalg/Transforms/LinalgPromotion.h index 8405e24bb1..30f5b35e66 100644 --- a/midend/include/Dialect/Linalg/Transforms/LinalgPromotion.h +++ b/midend/include/Dialect/Linalg/Transforms/LinalgPromotion.h @@ -1,5 +1,5 @@ -#ifndef LINALG_TRANSFORMS_LINALGPROMOTION_H -#define LINALG_TRANSFORMS_LINALGPROMOTION_H +#ifndef DIALECT_LINALG_TRANSFORMS_LINALGPROMOTION_H +#define DIALECT_LINALG_TRANSFORMS_LINALGPROMOTION_H #include "mlir/Pass/Pass.h" #include "llvm/ADT/StringRef.h" diff --git a/midend/include/Dialect/Transform/Passes.h b/midend/include/Dialect/Transform/Passes.h index ae7bc8b394..99c0796b07 100644 --- a/midend/include/Dialect/Transform/Passes.h +++ b/midend/include/Dialect/Transform/Passes.h @@ -1,3 +1,6 @@ +#ifndef DIALECT_TRANSFORM_PASSES_H +#define DIALECT_TRANSFORM_PASSES_H + #include "mlir/Pass/Pass.h" #include "mlir/Pass/PassRegistry.h" @@ -14,4 +17,6 @@ class ModuleOp; #define GEN_PASS_REGISTRATION #include "Transform/Passes.h.inc" -} \ No newline at end of file +} + +#endif \ No newline at end of file diff --git a/midend/lib/Dialect/CMakeLists.txt b/midend/lib/Dialect/CMakeLists.txt index f0e2ac6c13..2f98ccaba0 100644 --- a/midend/lib/Dialect/CMakeLists.txt +++ b/midend/lib/Dialect/CMakeLists.txt @@ -7,3 +7,4 @@ add_subdirectory(Gemmini) add_subdirectory(Sche) add_subdirectory(Transform) 
 add_subdirectory(Linalg)
+add_subdirectory(GPU)
\ No newline at end of file
diff --git a/midend/lib/Dialect/GPU/CMakeLists.txt b/midend/lib/Dialect/GPU/CMakeLists.txt
index e69de29bb2..5c919f7dfc 100644
--- a/midend/lib/Dialect/GPU/CMakeLists.txt
+++ b/midend/lib/Dialect/GPU/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(Transforms)
\ No newline at end of file
diff --git a/midend/lib/Dialect/GPU/Transforms/CMakeLists.txt b/midend/lib/Dialect/GPU/Transforms/CMakeLists.txt
new file mode 100644
index 0000000000..58e45c61c1
--- /dev/null
+++ b/midend/lib/Dialect/GPU/Transforms/CMakeLists.txt
@@ -0,0 +1,11 @@
+add_mlir_dialect_library(BuddyGPUPasses
+  GPUDistributeToWarp.cpp
+  RemoveReduntantLoops.cpp
+
+  DEPENDS
+  BuddyGPUPassIncGen
+
+  LINK_LIBS PUBLIC
+  MLIRIR
+  MLIRPass
+)
diff --git a/midend/lib/Dialect/GPU/Transforms/GPUDistributeToWarp.cpp b/midend/lib/Dialect/GPU/Transforms/GPUDistributeToWarp.cpp
new file mode 100644
index 0000000000..0a50cad71b
--- /dev/null
+++ b/midend/lib/Dialect/GPU/Transforms/GPUDistributeToWarp.cpp
@@ -0,0 +1,233 @@
+#include "GPU/Transforms/GPUDistributeToWarp.h"
+#include "PassDetail.h"
+#include "Utils/GemmCodegenUtils.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h"
+#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
+#include "mlir/Dialect/Linalg/Utils/Utils.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/IR/AffineExpr.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/IR/Value.h"
+#include "mlir/Support/LLVM.h"
+#include "mlir/Support/LogicalResult.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Casting.h"
+#include <algorithm>
+#include <memory>
+#include <optional>
+
+using namespace mlir;
+using namespace mlir::buddy;
+
+namespace {
+
+static constexpr int64_t warpSize = 32;
+
+// Get the parallel dims of the linalg op's loops;
+// parallel dims whose range is 1 are dropped.
+SmallVector<unsigned>
+getPartitionedLoopsFromLinalgOp(linalg::LinalgOp linalgOp) {
+  SmallVector<unsigned> parallelLoops;
+  // Return the dims that are parallel loops.
+  linalgOp.getParallelDims(parallelLoops);
+
+  // loopRanges holds the static size of each loop dimension.
+  SmallVector<int64_t> loopRanges = linalgOp.getStaticLoopRanges();
+  // Remove the dimensions in parallelLoops whose range is 1.
+  parallelLoops = llvm::to_vector(
+      llvm::make_filter_range(parallelLoops, [=](unsigned loopDim) {
+        return loopRanges[loopDim] != 1;
+      }));
+  return parallelLoops;
+}
+
+// LinalgTilingOptions needs this function to compute tile sizes;
+// pass it to LinalgTilingOptions::setTileSizeComputationFunction.
+// `op` is expected to be a LinalgOp.
+// Warp-level tiling only tiles dim M and dim N.
+std::optional<SmallVector<Value>>
+warpTileSizeComputationFunc(SmallVector<int64_t, 3> warpWorkGroup,
+                            OpBuilder &builder, Operation *op) {
+  func::FuncOp funcOp = op->getParentOfType<func::FuncOp>();
+  // Get the block-level tile config from the annotation in funcOp's attribute.
+  std::optional<SmallVector<int64_t, 3>> optionalGemmTileSize =
+      getGemmTileSize(funcOp);
+  if (!optionalGemmTileSize.has_value()) {
+    return std::nullopt;
+  }
+  SmallVector<int64_t, 3> gemmTileSize = optionalGemmTileSize.value();
+
+  SmallVector<int64_t> blockTileSize;
+  auto linalgOp = cast<linalg::LinalgOp>(op);
+  // blockTileSize only takes gemmTileSize's M and N: dim K is not tiled here.
+  if (linalgOp.getNumParallelLoops() == 3) { // BMM
+    blockTileSize = {0, gemmTileSize[0], gemmTileSize[1]};
+  } else { // Matmul
+    blockTileSize = {gemmTileSize[0], gemmTileSize[1]};
+  }
+
+  SmallVector<Value> warpTileSize;
+
+  auto partitionedLoops = getPartitionedLoopsFromLinalgOp(linalgOp);
+
+  auto zero = builder.create<arith::ConstantIndexOp>(op->getLoc(), 0);
+  warpTileSize.resize(cast<linalg::LinalgOp>(op).getLoopIteratorTypes().size(),
+                      zero);
+  // If only M and N are parallel, drop warpWorkGroup[2].
+  warpWorkGroup.resize(partitionedLoops.size());
+  // partitionedLoops is (M, N), which must map to (warpIdx.y, warpIdx.x).
+  std::reverse(warpWorkGroup.begin(), warpWorkGroup.end());
+
+  unsigned idx = 0;
+  for (auto depth : partitionedLoops) {
+    if (depth >= blockTileSize.size()) {
+      continue;
+    }
+    warpTileSize[depth] = builder.create<arith::ConstantIndexOp>(
+        op->getLoc(),
+        llvm::divideCeil(blockTileSize[depth], warpWorkGroup[idx++]));
+  }
+  return warpTileSize;
+}
+
+SmallVector<linalg::ProcInfo> getWarpInfoDistributedToTile(
+    OpBuilder &b, Location loc, ArrayRef<Range> parallelLoopRanges,
+    unsigned warpSize, SmallVector<int64_t, 3> warpWorkGroup) {
+  unsigned parallelDimNum = parallelLoopRanges.size();
+  SmallVector<linalg::ProcInfo> warpProcInfo(parallelDimNum);
+  SmallVector<gpu::Dimension> gpuDimAttrs = {
+      gpu::Dimension::x, gpu::Dimension::y, gpu::Dimension::z};
+  for (unsigned i = 0; i < parallelDimNum; i++) {
+    Value threadIdx_i = b.create<gpu::ThreadIdOp>(loc, gpuDimAttrs[i]);
+    Value warpIdx_i;
+    if (i == 0) { // warpIdx.x = threadIdx.x / 32
+      mlir::AffineExpr d0 = b.getAffineDimExpr(0);
+      warpIdx_i = affine::makeComposedAffineApply(
+          b, loc, d0.floorDiv(b.getAffineConstantExpr(warpSize)),
+          {threadIdx_i});
+    } else {
+      mlir::AffineExpr d0 = b.getAffineDimExpr(0);
+      warpIdx_i = affine::makeComposedAffineApply(b, loc, d0, {threadIdx_i});
+    }
+    // warpProcInfo[i] is mapped to the tiled scf dim.
+    warpProcInfo[parallelDimNum - 1 - i] = linalg::ProcInfo{
+        warpIdx_i,
+        b.create<arith::ConstantOp>(loc, b.getIndexAttr(warpWorkGroup[i])),
+        linalg::DistributionMethod::Cyclic};
+  }
+  return warpProcInfo;
+}
+
+LogicalResult distributeToWarpLevel(scf::ForallOp forallOp,
+                                    SmallVector<int64_t, 3> &workGroup) {
+  // Use LinalgTilingOptions to distribute to the warp level.
+
+  // Warp index computation:
+  // warpIdx.x = threadIdx.x / 32
+  // warpIdx.y = threadIdx.y
+  // warpIdx.z = threadIdx.z
+  if (workGroup[0] / warpSize == 0) {
+    return failure();
+  }
+  SmallVector<int64_t, 3> warpWorkGroup = {workGroup[0] / warpSize,
+                                           workGroup[1], workGroup[2]};
+
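+  // Worked example (illustrative numbers, not taken from this patch): a
+  // (128, 2, 1) workgroup yields warpWorkGroup = {128 / 32, 2, 1} = {4, 2, 1}.
+  // For a plain matmul with a 128x128 block tile, the partitioned loops are
+  // (M, N), so warpTileSizeComputationFunc resizes warpWorkGroup to {4, 2},
+  // reverses it to {2, 4}, and assigns each warp a
+  // ceil(128 / 2) x ceil(128 / 4) = 64x32 tile.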
+  linalg::LinalgTilingOptions linalgTilingOptions =
+      linalg::LinalgTilingOptions();
+
+  auto tileSizeComputationFunction = [=](OpBuilder &b, Operation *op) {
+    return warpTileSizeComputationFunc(warpWorkGroup, b, op).value();
+  };
+  linalgTilingOptions.setTileSizeComputationFunction(
+      tileSizeComputationFunction);
+
+  linalg::LinalgLoopDistributionOptions distributionOptions;
+  auto getWarpProcInfoFn = [=](OpBuilder &b, Location loc,
+                               ArrayRef<Range> parallelLoopRanges) {
+    return getWarpInfoDistributedToTile(b, loc, parallelLoopRanges, warpSize,
+                                        warpWorkGroup);
+  };
+  distributionOptions.procInfo = getWarpProcInfoFn;
+  linalgTilingOptions.setDistributionOptions(distributionOptions)
+      .setLoopType(linalg::LinalgTilingLoopType::Loops);
+
+  SmallVector<linalg::LinalgOp> candidates;
+  forallOp.walk([&](linalg::LinalgOp linalgOp) {
+    if (isa<linalg::FillOp>(linalgOp) || isLinalgMatmul(linalgOp)) {
+      candidates.push_back(linalgOp);
+    }
+  });
+
+  IRRewriter rewriter(forallOp->getContext());
+  for (auto linalgOp : candidates) {
+    FailureOr<linalg::TiledLinalgOp> res =
+        linalg::tileLinalgOp(rewriter, linalgOp, linalgTilingOptions);
+    if (failed(res)) {
+      return failure();
+    }
+    if (res->tensorResults.empty()) {
+      rewriter.eraseOp(linalgOp);
+    } else {
+      rewriter.replaceOp(linalgOp, res->tensorResults);
+    }
+  }
+
+  return success();
+}
+
+struct GPUDistributeToWarpPass
+    : public GPUDistributeToWarpBase<GPUDistributeToWarpPass> {
+public:
+  GPUDistributeToWarpPass() = default;
+
+  void runOnOperation() override {
+    func::FuncOp funcOp = getOperation();
+
+    if (!funcHasGemm(funcOp)) {
+      return;
+    }
+
+    // Get the block size from the annotation in funcOp's attribute.
+    std::optional<SmallVector<int64_t, 3>> optionalWorkGroup =
+        getGemmBlockSize(funcOp);
+    if (!optionalWorkGroup.has_value()) {
+      return;
+    }
+    SmallVector<int64_t, 3> workGroup = optionalWorkGroup.value();
+
+    std::optional<scf::ForallOp> optionalBlockForallOp =
+        getForallOpMappedToBlock(funcOp);
+    if (!optionalBlockForallOp.has_value()) {
+      return;
+    }
+    scf::ForallOp forallOp = optionalBlockForallOp.value();
+    if (failed(distributeToWarpLevel(forallOp, workGroup))) {
+      return signalPassFailure();
+    }
+
+    {
+      // Apply canonicalization patterns.
+      RewritePatternSet threadTilingCanonicalizationPatterns =
+          linalg::getLinalgTilingCanonicalizationPatterns(funcOp.getContext());
+      // populateAffineMinSCFCanonicalizationPattern(
+      //     threadTilingCanonicalizationPatterns);
+      if (failed(applyPatternsAndFoldGreedily(
+              funcOp, std::move(threadTilingCanonicalizationPatterns)))) {
+        return signalPassFailure();
+      }
+    }
+  }
+};
+} // namespace
+
+std::unique_ptr<OperationPass<func::FuncOp>>
+mlir::createGPUDistributeToWarpPass() {
+  return std::make_unique<GPUDistributeToWarpPass>();
+}
diff --git a/midend/lib/Dialect/GPU/Transforms/PassDetail.h b/midend/lib/Dialect/GPU/Transforms/PassDetail.h
new file mode 100644
index 0000000000..4d085863c6
--- /dev/null
+++ b/midend/lib/Dialect/GPU/Transforms/PassDetail.h
@@ -0,0 +1,13 @@
+#ifndef DIALECT_GPU_TRANSFORMS_PASSDETAIL_H
+#define DIALECT_GPU_TRANSFORMS_PASSDETAIL_H
+
+#include "mlir/Pass/Pass.h"
+
+namespace mlir {
+
+#define GEN_PASS_CLASSES
+#include "GPU/Passes.h.inc"
+
+} // namespace mlir
+
+#endif
\ No newline at end of file
diff --git a/midend/lib/Dialect/GPU/Transforms/RemoveReduntantLoops.cpp b/midend/lib/Dialect/GPU/Transforms/RemoveReduntantLoops.cpp
new file mode 100644
index 0000000000..d98f27e12d
--- /dev/null
+++ b/midend/lib/Dialect/GPU/Transforms/RemoveReduntantLoops.cpp
@@ -0,0 +1,313 @@
+// Some code comes from
+// compiler/lib/Dialect/SCF/Transforms/RemoveSingleIterationLoop.cpp
+// Copyright 2024 ByteDance Ltd. and/or its affiliates. All rights reserved.
+// Licensed under the Apache License, Version 2.0 (the "License");
+
+#include "mlir/Dialect/Affine/Utils.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Dialect/Utils/StaticValueUtils.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/SymbolTable.h"
+#include "mlir/IR/Value.h"
+#include "mlir/IR/ValueRange.h"
+#include "mlir/Pass/Pass.h"
+
+#include "GPU/Transforms/RemoveReduntantLoops.h"
+#include "PassDetail.h"
+#include "Utils/GemmCodegenUtils.h"
+#include "mlir/Support/LLVM.h"
+#include "mlir/Support/LogicalResult.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "llvm/ADT/SmallVector.h"
+#include <functional>
+#include <optional>
+
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace mlir;
+using namespace mlir::buddy;
+
+namespace {
+
+using GetMinMaxExprFn =
+    std::function<std::optional<std::pair<AffineExpr, AffineExpr>>(
+        Value value, SmallVectorImpl<Value> &dims,
+        SmallVectorImpl<Value> &symbols)>;
+
+/// The pattern detects single-iteration loops based on the range returned by
+/// the lambda |getMinMaxFn| for some known values.
+struct RemoveSingleIterationLoops : public OpRewritePattern<scf::ForOp> {
+private:
+  /// Compose the map with affine.apply ops and try to simplify it.
+  static void combineAndSimplifyMap(AffineMap &map,
+                                    SmallVectorImpl<Value> &dims,
+                                    SmallVectorImpl<Value> &symbols) {
+    SmallVector<Value> operands(dims.begin(), dims.end());
+    operands.append(symbols.begin(), symbols.end());
+    // Pull in affine.apply operations and compose them fully into the
+    // result.
+    affine::fullyComposeAffineMapAndOperands(&map, &operands);
+    affine::canonicalizeMapAndOperands(&map, &operands);
+    map = simplifyAffineMap(map);
+    // Assign the results.
+    dims.assign(operands.begin(), operands.begin() + map.getNumDims());
+    symbols.assign(operands.begin() + map.getNumDims(), operands.end());
+  }
+
+  /// Replace dimensions and symbols with known ranges in the map expression.
+  // TODO: Use the core function once the interface using a lambda lands.
+  static AffineMap substituteMin(AffineMap map, SmallVectorImpl<Value> &dims,
+                                 SmallVectorImpl<Value> &symbols,
+                                 GetMinMaxExprFn getMinMaxExpr) {
+    combineAndSimplifyMap(map, dims, symbols);
+
+    auto exprs = llvm::to_vector(map.getResults());
+    for (AffineExpr &expr : exprs) {
+      bool substituted = true;
+      while (substituted) {
+        substituted = false;
+        for (unsigned dimIdx = 0; dimIdx < dims.size(); ++dimIdx) {
+          Value dim = dims[dimIdx];
+          auto minMax = getMinMaxExpr(dim, dims, symbols);
+          if (!minMax)
+            continue;
+          AffineExpr dimExpr = getAffineDimExpr(dimIdx, expr.getContext());
+          // Substitute occurrences of `dimExpr` by either the min expression
+          // or the max expression depending on whether the value is used with
+          // a positive or negative coefficient.
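+          // Illustrative example (values assumed, not from this patch): if
+          // `expr` is (128 - d0) and d0 is known to lie in [0, 127], the
+          // coefficient on d0 is negative, so d0 is replaced by its max,
+          // yielding the constant 128 - 127 = 1 as the expression's minimum.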
+          AffineExpr substitutedExpr = affine::substWithMin(
+              expr, dimExpr, minMax->first, minMax->second);
+          substituted = (substitutedExpr != expr);
+          expr = substitutedExpr;
+        }
+        // Substitute symbols.
+        for (unsigned symIdx = 0; symIdx < symbols.size(); ++symIdx) {
+          Value sym = symbols[symIdx];
+          auto minMax = getMinMaxExpr(sym, dims, symbols);
+          if (!minMax)
+            continue;
+          AffineExpr symExpr = getAffineSymbolExpr(symIdx, expr.getContext());
+          AffineExpr substitutedExpr = affine::substWithMin(
+              expr, symExpr, minMax->first, minMax->second);
+          substituted = (substitutedExpr != expr);
+          expr = substitutedExpr;
+        }
+      }
+      map = AffineMap::get(dims.size(), symbols.size(), exprs,
+                           exprs.front().getContext());
+      // Clean up and simplify the results.
+      // This needs to happen outside of the loop iterating on dims.size()
+      // since it modifies dims.
+      combineAndSimplifyMap(map, dims, symbols);
+      // Assign the results.
+      exprs.assign(map.getResults().begin(), map.getResults().end());
+    }
+
+    assert(!exprs.empty() && "Unexpected empty exprs");
+    return AffineMap::get(dims.size(), symbols.size(), exprs,
+                          map.getContext());
+  }
+
+  /// Replaces the given op with the contents of the given single-block region,
+  /// using the operands of the block terminator to replace operation results.
+  static void replaceOpWithRegion(PatternRewriter &rewriter, Operation *op,
+                                  Region &region, ValueRange blockArgs = {}) {
+    assert(llvm::hasSingleElement(region) && "expected single-block region");
+    Block *block = &region.front();
+    Operation *terminator = block->getTerminator();
+    ValueRange results = terminator->getOperands();
+    rewriter.inlineBlockBefore(block, op, blockArgs);
+    rewriter.replaceOp(op, results);
+    rewriter.eraseOp(terminator);
+  }
+
+  static void replaceOpWithBlocks(PatternRewriter &rewriter, Operation *op,
+                                  Region &region, ValueRange blockArgs = {}) {
+  }
+
+  /// Return true if we can prove that we always run at least the first
+  /// iteration of the ForOp.
+  static bool alwaysRunsFirstIteration(scf::ForOp op,
+                                       GetMinMaxExprFn getMinMax) {
+    // Calculate the minimum value of ub - lb. If it is strictly positive it
+    // means the loop will always run at least once.
+    MLIRContext *ctx = op->getContext();
+    SmallVector<Value> dims;
+    SmallVector<Value> symbols;
+    AffineExpr lb = getAffineDimExpr(dims.size(), ctx);
+    dims.push_back(op.getLowerBound());
+    AffineExpr ub = getAffineDimExpr(dims.size(), ctx);
+    dims.push_back(op.getUpperBound());
+    AffineExpr iterZero = ub - lb;
+    auto map = AffineMap::get(dims.size(), 0, iterZero);
+    AffineMap simplifiedMap = substituteMin(map, dims, symbols, getMinMax);
+    assert(simplifiedMap.getNumResults() == 1);
+    auto cst = simplifiedMap.getResult(0).cast<AffineConstantExpr>();
+    if (cst.getValue() > 0)
+      return true;
+    return false;
+  }
+
+  /// Return true if we can prove that we never run more than one iteration
+  /// of the ForOp.
+  static bool neverRunsSecondIteration(scf::ForOp op,
+                                       GetMinMaxExprFn getMinMax) {
+    // Calculate the minimum of lb + step - ub. If it is non-negative, the
+    // loop never runs more than once.
+    MLIRContext *ctx = op->getContext();
+    SmallVector<Value> dims;
+    SmallVector<Value> symbols;
+    AffineExpr lb = getAffineDimExpr(dims.size(), ctx);
+    dims.push_back(op.getLowerBound());
+    AffineExpr ub = getAffineDimExpr(dims.size(), ctx);
+    dims.push_back(op.getUpperBound());
+    AffineExpr step = getAffineDimExpr(dims.size(), ctx);
+    dims.push_back(op.getStep());
+    AffineExpr iterOne = lb + step - ub;
+    auto map = AffineMap::get(dims.size(), 0, iterOne);
+    AffineMap simplifiedMap = substituteMin(map, dims, symbols, getMinMax);
+    assert(simplifiedMap.getNumResults() == 1);
+    auto cst = simplifiedMap.getResult(0).cast<AffineConstantExpr>();
+    if (cst.getValue() >= 0)
+      return true;
+    return false;
+  }
+
+public:
+  RemoveSingleIterationLoops(MLIRContext *context, GetMinMaxExprFn getMinMax)
+      : OpRewritePattern<scf::ForOp>(context, /*benefit=*/1),
+        getMinMax(getMinMax) {}
+
+  LogicalResult matchAndRewrite(scf::ForOp forOp,
+                                PatternRewriter &rewriter) const override {
+    if (!(alwaysRunsFirstIteration(forOp, getMinMax) &&
+          neverRunsSecondIteration(forOp, getMinMax))) {
+      return failure();
+    }
+    SmallVector<Value> blockArgs;
+    blockArgs.reserve(forOp.getInitArgs().size() + 1);
+    blockArgs.push_back(forOp.getLowerBound());
+    llvm::append_range(blockArgs, forOp.getInitArgs());
+    replaceOpWithRegion(rewriter, forOp, forOp.getRegion(), blockArgs);
+    return success();
+  }
+
+private:
+  GetMinMaxExprFn getMinMax;
+};
+
+} // namespace
+
+namespace {
+
+/// Converts a symbolic GPU processor dimension to its numeric one.
+static unsigned gpuDimToIndex(gpu::Dimension dim) {
+  switch (dim) {
+  case gpu::Dimension::x:
+    return 0;
+  case gpu::Dimension::y:
+    return 1;
+  case gpu::Dimension::z:
+    return 2;
+  default:
+    assert(false && "invalid dimension");
+    return 0;
+  }
+}
+
+/// If the value is a threadID, return the range [0, workGroup[dim] - 1];
+/// if it is a blockDim, return the exact range
+/// [workGroup[dim], workGroup[dim]]; if it is a blockID, return
+/// [0, blockGroup[dim] - 1]. As this function is only used in GEMM codegen,
+/// we can assume every loop variable of interest derives from gpu.thread_id,
+/// gpu.block_dim, or gpu.block_id.
+static std::optional<std::pair<AffineExpr, AffineExpr>>
+getWorkgroupRange(Value processorValue, SmallVectorImpl<Value> & /*dims*/,
+                  SmallVectorImpl<Value> & /*symbols*/,
+                  ArrayRef<int64_t> workGroup, ArrayRef<int64_t> blockGroup) {
+  OpBuilder builder(processorValue.getContext());
+  // If the value is a threadID return the range [0, blockDim.i - 1].
+  if (auto idOp = processorValue.getDefiningOp<gpu::ThreadIdOp>()) {
+    unsigned index = gpuDimToIndex(idOp.getDimension());
+    AffineExpr zero = builder.getAffineConstantExpr(0);
+    AffineExpr ubExpr = builder.getAffineConstantExpr(workGroup[index]);
+    return std::make_pair(zero, ubExpr - 1);
+  }
+  // If the value is a blockDim return the range [workGroup[i], workGroup[i]].
+  if (auto dimOp = processorValue.getDefiningOp<gpu::BlockDimOp>()) {
+    unsigned index = gpuDimToIndex(dimOp.getDimension());
+    AffineExpr bound = builder.getAffineConstantExpr(workGroup[index]);
+    return std::make_pair(bound, bound);
+  }
+  // If the value is a blockID return the range [0, blockGroup[i] - 1].
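+  // For instance (illustrative, assuming a 32x16x1 grid): gpu.block_id x is
+  // bounded by the pair (0, 31) and gpu.block_id y by (0, 15), which lets the
+  // pattern above fold away loops whose trip count depends only on these ids.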
+  if (auto idOp = processorValue.getDefiningOp<gpu::BlockIdOp>()) {
+    unsigned index = gpuDimToIndex(idOp.getDimension());
+    AffineExpr zero = builder.getAffineConstantExpr(0);
+    AffineExpr ubExpr = builder.getAffineConstantExpr(blockGroup[index]);
+    return std::make_pair(zero, ubExpr - 1);
+  }
+
+  return std::nullopt;
+}
+
+LogicalResult removeReduntantLoops(func::FuncOp funcOp,
+                                   SmallVector<int64_t, 3> workGroup,
+                                   SmallVector<int64_t> blockGroup) {
+  auto getParallelRangeFn = [=](Value processorValue,
+                                SmallVectorImpl<Value> &dims,
+                                SmallVectorImpl<Value> &symbols) {
+    return getWorkgroupRange(processorValue, dims, symbols, workGroup,
+                             blockGroup);
+  };
+  RewritePatternSet patterns(funcOp->getContext());
+  patterns.add<RemoveSingleIterationLoops>(patterns.getContext(),
+                                           getParallelRangeFn);
+  return applyPatternsAndFoldGreedily(funcOp, std::move(patterns));
+}
+
+struct RemoveReduntantLoopsPass
+    : public RemoveReduntantLoopsBase<RemoveReduntantLoopsPass> {
+public:
+  RemoveReduntantLoopsPass() = default;
+
+  void runOnOperation() override {
+    func::FuncOp funcOp = getOperation();
+
+    if (!funcHasGemm(funcOp)) {
+      return;
+    }
+
+    std::optional<SmallVector<int64_t, 3>> optionalWorkGroup =
+        getGemmBlockSize(funcOp);
+    if (!optionalWorkGroup.has_value()) {
+      return;
+    }
+    SmallVector<int64_t, 3> workGroup = optionalWorkGroup.value();
+
+    std::optional<scf::ForallOp> optionalBlockForallOp =
+        getForallOpMappedToBlock(funcOp);
+    if (!optionalBlockForallOp.has_value()) {
+      return;
+    }
+    scf::ForallOp forallOp = optionalBlockForallOp.value();
+    auto optionalBlockGroups =
+        getConstantIntValues(forallOp.getMixedUpperBound());
+    SmallVector<int64_t> blockGroup = optionalBlockGroups.value();
+    blockGroup.push_back(1);
+    if (failed(removeReduntantLoops(funcOp, workGroup, blockGroup))) {
+      return signalPassFailure();
+    }
+  }
+};
+
+} // namespace
+
+std::unique_ptr<OperationPass<func::FuncOp>>
+mlir::createRemoveReduntantLoops() {
+  return std::make_unique<RemoveReduntantLoopsPass>();
+}
\ No newline at end of file
diff --git a/midend/lib/Dialect/Linalg/Transforms/LinalgPromotion.cpp b/midend/lib/Dialect/Linalg/Transforms/LinalgPromotion.cpp
index 31f4225e43..811ea12713 100644
--- a/midend/lib/Dialect/Linalg/Transforms/LinalgPromotion.cpp
+++ b/midend/lib/Dialect/Linalg/Transforms/LinalgPromotion.cpp
@@ -193,11 +193,14 @@ struct LinalgPromotionPass : public LinalgPromotionBase<LinalgPromotionPass> {
     }
 
     promotionImpl(b, linalgContractOp);
+    b.setInsertionPoint(linalgContractOp);
+    b.create<gpu::BarrierOp>(linalgContractOp->getLoc());
+    b.setInsertionPointAfter(linalgContractOp);
+    b.create<gpu::BarrierOp>(linalgContractOp->getLoc());
-
   }
 };
 
-} // namespace mlir
+} // namespace
 
 std::unique_ptr<OperationPass<func::FuncOp>> mlir::createLinalgPromotionPass() {
diff --git a/midend/lib/Pipelines/CMakeLists.txt b/midend/lib/Pipelines/CMakeLists.txt
index 59cd289a1c..a6c36eceb0 100644
--- a/midend/lib/Pipelines/CMakeLists.txt
+++ b/midend/lib/Pipelines/CMakeLists.txt
@@ -9,6 +9,7 @@ add_mlir_library(BuddyPipelines
   MLIRIR
   BuddyGPUPipelines
   BuddyTransformPasses
+  BuddyGPUPasses
   BuddyGemmCodegenUtils
   BuddyPipelineUtils
 )
\ No newline at end of file
diff --git a/midend/lib/Pipelines/LinalgMemrefOpt.cpp b/midend/lib/Pipelines/LinalgMemrefOpt.cpp
index a7d5538162..9d9c3c51da 100644
--- a/midend/lib/Pipelines/LinalgMemrefOpt.cpp
+++ b/midend/lib/Pipelines/LinalgMemrefOpt.cpp
@@ -1,4 +1,6 @@
 #include "Pipelines/LinalgMemrefOpt.h"
+#include "GPU/Transforms/GPUDistributeToWarp.h"
+#include "GPU/Transforms/RemoveReduntantLoops.h"
 #include "Linalg/Transforms/LinalgPromotion.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Pass/PassManager.h"
@@ -17,6 +19,8 @@ void addGemmLinalgMemrefOptPipeline(OpPassManager &pm) {
   pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
   pm.addNestedPass<func::FuncOp>(createCSEPass());
   pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
+  pm.addNestedPass<func::FuncOp>(createGPUDistributeToWarpPass());
+  pm.addNestedPass<func::FuncOp>(createRemoveReduntantLoops());
 }
 
 void createLinalgMemrefOptPipelineImpl(OpPassManager &pm,