Merge branch 'main' into mm_cascade

Xilinx · May 28, 2024 · 1c2e38d · 1c2e38d
2 parents 9e78f4d + b693d4e
commit 1c2e38d
Show file tree

Hide file tree

Showing 22 changed files with 688 additions and 162 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -180,6 +180,8 @@ find_library(ELF_LIB elf)
 cmake_dependent_option(AIE_ENABLE_AIRBIN
   "Enables emitting AIRBIN ELF binaries." OFF "ELF_LIB" OFF)
 
+# On MSVC, statically link the C/C++ runtime libraries (/MT, or /MTd for Debug).
+set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>")
 
 add_flag_if_supported("-Werror=sign-compare" WERROR_SIGN_COMPARE)
 add_flag_if_supported("-Werror=unused" WERROR_USED)

diff --git a/aie_runtime_lib/AIE/aiesim/CMakeLists.txt b/aie_runtime_lib/AIE/aiesim/CMakeLists.txt
@@ -7,7 +7,6 @@
 
 
 set(INSTALLS
-    Makefile
     genwrapper_for_ps.cpp)
 
 

diff --git a/aie_runtime_lib/AIE/aiesim/Makefile b/aie_runtime_lib/AIE/aiesim/Makefile
diff --git a/aie_runtime_lib/AIE2/aiesim/CMakeLists.txt b/aie_runtime_lib/AIE2/aiesim/CMakeLists.txt
@@ -7,7 +7,6 @@
 
 
 set(INSTALLS
-    Makefile
     genwrapper_for_ps.cpp)
 
 

diff --git a/aie_runtime_lib/AIE2/aiesim/Makefile b/aie_runtime_lib/AIE2/aiesim/Makefile
diff --git a/cmake/toolchainFiles/toolchain_x86_64.cmake b/cmake/toolchainFiles/toolchain_x86_64.cmake
@@ -1,4 +1,7 @@
-# Copyright (C) 2018-2022, Xilinx Inc. All rights reserved.
-# Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
-# SPDX-License-Identifier: MIT
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2018-2024 Advanced Micro Devices, Inc. or its affiliates
 
diff --git a/docs/Building.md b/docs/Building.md
@@ -118,6 +118,11 @@ and llvm.
     source utils/env_setup.sh <mlir-aie>/install <llvm dir>/install
     ```
 
+Note that when coming back to this install with a fresh environment, you must rerun the `utils/env_setup.sh` script to set up your environment, and also activate the Python virtual environment using the following command:
+```
+source sandbox/bin/activate
+```
+
 ## Building on X86 targetting the VCK5000
 
 In order to build and run on PCIe cards, you first have to build and install the aie-rt library. We chose to install the library in /opt/xaiengine but it is not required for the tools to be installed there. Just ensure that when building mlir-aie and mlir-air, that you point to the directory in which the aie-rt library was installed.
@@ -147,6 +152,8 @@ Then, set `${ROCM_ROOT}` to the ROCm install from the previous path. Then, run t
 
 The PCIe AIR runtime requires the use of the [AIR PCIe kernel driver](https://github.com/Xilinx/ROCm-air-platforms/tree/main/driver). The driver directory in the [ROCm-air-platforms](https://github.com/Xilinx/ROCm-air-platforms) repository contains documentation on how to compile and load the AIR PCIe kernel driver.
 
+After this is complete, refer back to Step 5 of `Building on X86 for mlir-aie development` to set up the rest of your environment.
+
 ### Sysroot
 Since the AIE tools are cross-compiling, in order to actually compile code, we need a 'sysroot' directory,
 containing an ARM rootfs.  This rootfs must match what will be available in the runtime environment.

diff --git a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp
@@ -13,7 +13,7 @@
 #include "aie/Dialect/AIE/IR/AIEDialect.h"
 #include "aie/Dialect/AIE/Transforms/AIEPasses.h"
 
-#include "mlir/Analysis/SliceAnalysis.h"
+#include "mlir/Analysis/TopologicalSortUtils.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"

diff --git a/lib/Dialect/AIEVec/Transforms/VectorToVectorConversions.cpp b/lib/Dialect/AIEVec/Transforms/VectorToVectorConversions.cpp
@@ -4,7 +4,7 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-// (c) Copyright 2023, Advanced Micro Devices, Inc.
+// (c) Copyright 2023-2024 Advanced Micro Devices, Inc.
 //
 //===----------------------------------------------------------------------===//
 // This file contains conversions and rewrites to the Vector dialect to make
@@ -39,6 +39,55 @@ using namespace xilinx::aievec;
 //================== Common AIE canonicalization analysis ====================//
 //============================================================================//
 
+static bool isGemmBTransposedContractionOp(vector::ContractionOp op) {
+  if (op.getKind() != vector::CombiningKind::ADD)
+    return false;
+
+  // Get and check shape of operands
+  auto lhsShape = op.getLhsType().getShape();
+  auto rhsShape = op.getRhsType().getShape();
+  auto accShape = cast<ShapedType>(op.getAccType()).getShape();
+  if (lhsShape.size() < 2 || rhsShape.size() < 2 || accShape.size() < 2)
+    return false;
+
+  // Check that the innermost iterators match gemm-like iterators
+  SmallVector<vector::IteratorType> iterators = op.getIteratorTypesArray();
+  if (iterators.size() < 3)
+    return false;
+  auto innerMostIterators =
+      SmallVector<vector::IteratorType>(iterators.end() - 3, iterators.end());
+  if (vector::IteratorType::parallel != innerMostIterators[0] ||
+      vector::IteratorType::parallel != innerMostIterators[1] ||
+      vector::IteratorType::reduction != innerMostIterators[2])
+    return false;
+
+  // Get indexing maps of iterators for operands
+  SmallVector<AffineMap, 4> indexingMaps(op.getIndexingMapsArray());
+  SmallVector<int64_t> outerMostResults;
+  for (int64_t i = 0; i < indexingMaps[0].getNumResults() - 2; i++)
+    outerMostResults.push_back(i);
+
+  auto innerLhsMap = indexingMaps[0].dropResults(outerMostResults);
+  auto innerRhsMap = indexingMaps[1].dropResults(outerMostResults);
+  auto innerAccMap = indexingMaps[2].dropResults(outerMostResults);
+
+  // Check whether they conform to a "transposed B" gemm
+  auto ctx = op.getContext();
+  auto mmAidxMap =
+      AffineMap::getPermutationMap(ArrayRef<unsigned>{1, 0, 2}, ctx)
+          .dropResults(0);
+  auto mmBidxMap =
+      AffineMap::getPermutationMap(ArrayRef<unsigned>{0, 1, 2}, ctx)
+          .dropResults(0);
+  auto mmCidxMap =
+      AffineMap::getPermutationMap(ArrayRef<unsigned>{2, 0, 1}, ctx)
+          .dropResults(0);
+  int64_t numOuterMostDims = indexingMaps[0].getNumDims() - 3;
+  return innerLhsMap == mmAidxMap.shiftDims(numOuterMostDims) &&
+         innerRhsMap == mmBidxMap.shiftDims(numOuterMostDims) &&
+         innerAccMap == mmCidxMap.shiftDims(numOuterMostDims);
+}
+
 //============================================================================//
 //============ Common AIE canonicalization conversion patterns ===============//
 //============================================================================//
@@ -411,6 +460,107 @@ struct FlattenMultDimTransferWritePattern
   }
 };
 
+// This pattern takes out an implicit transposition of the `rhs` operand in a
+// gemm-like contraction op, making it an explicit `vector.transpose` op.
+// If `rhs` is coming from a widening op (`extf`/`extsi`/`extui`), the
+// transposition will be hoisted above the widening op.
+struct ExtractTransposeFromContractionOp
+    : public OpConversionPattern<vector::ContractionOp> {
+  using OpConversionPattern<vector::ContractionOp>::OpConversionPattern;
+
+  static VectorType getTransposedVectorType(VectorType vecTy) {
+    SmallVector<int64_t> shape{vecTy.getShape()};
+    auto nDim = shape.size();
+    int64_t dimNm1 = shape[nDim - 1];
+    shape[nDim - 1] = shape[nDim - 2];
+    shape[nDim - 2] = dimNm1;
+    auto elemTy = vecTy.getElementType();
+    return VectorType::get(shape, elemTy);
+  }
+
+  LogicalResult
+  matchAndRewrite(vector::ContractionOp contractOp, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    if (!isGemmBTransposedContractionOp(contractOp))
+      return failure();
+
+    Location loc = contractOp.getLoc();
+    auto ctx = rewriter.getContext();
+
+    Value rhsVal = adaptor.getRhs();
+    VectorType rhsVecTy = contractOp.getRhsType();
+    Type rhsElemTy = rhsVecTy.getElementType();
+
+    bool doExtF = false, doExtSI = false, doExtUI = false;
+    if (auto extfRhsOp = rhsVal.getDefiningOp<arith::ExtFOp>()) {
+      rhsVal = extfRhsOp.getIn();
+      rhsVecTy = cast<VectorType>(rhsVal.getType());
+      doExtF = true;
+    } else if (auto extsiRhsOp = rhsVal.getDefiningOp<arith::ExtSIOp>()) {
+      rhsVal = extsiRhsOp.getIn();
+      rhsVecTy = cast<VectorType>(rhsVal.getType());
+      doExtSI = true;
+    } else if (auto extuiRhsOp = rhsVal.getDefiningOp<arith::ExtUIOp>()) {
+      rhsVal = extuiRhsOp.getIn();
+      rhsVecTy = cast<VectorType>(rhsVal.getType());
+      doExtUI = true;
+    }
+
+    int64_t nDim = rhsVecTy.getShape().size();
+    SmallVector<int64_t> rhsPermutation;
+    for (int64_t i = 0; i < nDim - 2; i++)
+      rhsPermutation.push_back(i);
+    rhsPermutation.push_back(nDim - 1);
+    rhsPermutation.push_back(nDim - 2);
+    auto transpRhsVecTy = getTransposedVectorType(rhsVecTy);
+    rhsVal = rewriter
+                 .create<vector::TransposeOp>(loc, transpRhsVecTy, rhsVal,
+                                              rhsPermutation)
+                 .getResult();
+
+    if (doExtF)
+      rhsVal =
+          rewriter
+              .create<arith::ExtFOp>(
+                  loc, VectorType::get(transpRhsVecTy.getShape(), rhsElemTy),
+                  rhsVal)
+              .getOut();
+    if (doExtSI)
+      rhsVal =
+          rewriter
+              .create<arith::ExtSIOp>(
+                  loc, VectorType::get(transpRhsVecTy.getShape(), rhsElemTy),
+                  rhsVal)
+              .getOut();
+    if (doExtUI)
+      rhsVal =
+          rewriter
+              .create<arith::ExtUIOp>(
+                  loc, VectorType::get(transpRhsVecTy.getShape(), rhsElemTy),
+                  rhsVal)
+              .getOut();
+
+    SmallVector<AffineMap, 4> oldIdxMaps(contractOp.getIndexingMapsArray());
+
+    nDim = oldIdxMaps[1].getNumDims();
+    SmallVector<int64_t> innerDimPerm;
+    for (int64_t i = 0; i < nDim - 2; i++)
+      innerDimPerm.push_back(i);
+    innerDimPerm.push_back(nDim - 1);
+    innerDimPerm.push_back(nDim - 2);
+    auto transpPermMap = AffineMap::getPermutationMap(innerDimPerm, ctx);
+
+    auto newIdxMaps = rewriter.getAffineMapArrayAttr(
+        {oldIdxMaps[0], oldIdxMaps[1].compose(transpPermMap), oldIdxMaps[2]});
+
+    rewriter.replaceOpWithNewOp<vector::ContractionOp>(
+        contractOp, contractOp.getResult().getType(), adaptor.getLhs(), rhsVal,
+        adaptor.getAcc(), newIdxMaps, contractOp.getIteratorTypes());
+
+    return success();
+  }
+};
+
 //============================================================================//
 //============ AIEML canonicalization conversion patterns ===============//
 //============================================================================//
@@ -470,15 +620,20 @@ static void configureAIEMLCanonicalizeLegalizations(ConversionTarget &target,
       [](vector::TransferWriteOp op) {
         return cast<VectorType>(op.getVector().getType()).getRank() < 2;
       });
+  target.addDynamicallyLegalOp<vector::ContractionOp>(
+      [](vector::ContractionOp op) {
+        return !isGemmBTransposedContractionOp(op);
+      });
 }
 
 static void
 populateAIEMLCanonicalizeConversionPatterns(RewritePatternSet &patterns,
                                             TargetBackend backend) {
   patterns.add<SplitUnalignedTransferReadPattern>(patterns.getContext(), 1024,
                                                   256);
-  patterns.add<FlattenMultDimTransferReadPattern,
-               FlattenMultDimTransferWritePattern>(patterns.getContext());
+  patterns
+      .add<ExtractTransposeFromContractionOp, FlattenMultDimTransferReadPattern,
+           FlattenMultDimTransferWritePattern>(patterns.getContext());
 }
 
 //============================================================================//

diff --git a/programming_examples/ml/resnet/layers_conv2_x/aie.mlir b/programming_examples/ml/resnet/layers_conv2_x/aie.mlir
@@ -6,7 +6,7 @@
 //===----------------------------------------------------------------------===//
 
 module {
-aie.device(npu) {
+aie.device(npu1_3col) {
 
   //shim
   %tile00 = aie.tile(0, 0)