diff --git a/include/aie/Dialect/AIEVec/Analysis/CMakeLists.txt b/include/aie/Dialect/AIEVec/Analysis/CMakeLists.txt
new file mode 100644
index 0000000000..570641cef5
--- /dev/null
+++ b/include/aie/Dialect/AIEVec/Analysis/CMakeLists.txt
@@ -0,0 +1,12 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2022 Xilinx Inc.
+
+set(LLVM_TARGET_DEFINITIONS Passes.td)
+mlir_tablegen(Passes.h.inc -gen-pass-decls -name AIEVecAnalysis)
+add_public_tablegen_target(MLIRAIEVecAnalysisPassIncGen)
+
+add_mlir_doc(Passes AIEVecAnalysisPasses ./ -gen-pass-doc)
diff --git a/include/aie/Dialect/AIEVec/Analysis/Passes.h b/include/aie/Dialect/AIEVec/Analysis/Passes.h
new file mode 100644
index 0000000000..de1dc35cef
--- /dev/null
+++ b/include/aie/Dialect/AIEVec/Analysis/Passes.h
@@ -0,0 +1,46 @@
+//===- Passes.h - AIE Vector Passes -----------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// (c) Copyright 2022 Xilinx Inc.
+//
+//===----------------------------------------------------------------------===//
+// Register all the AIE vectorization passes
+//===----------------------------------------------------------------------===//
+
+#ifndef AIE_DIALECT_AIEVEC_ANALYSIS_PASSES_H
+#define AIE_DIALECT_AIEVEC_ANALYSIS_PASSES_H
+
+#include "mlir/Pass/Pass.h"
+#include "mlir/Pass/PassOptions.h"
+#include <memory>
+
+//===----------------------------------------------------------------------===//
+// Registration
+//===----------------------------------------------------------------------===//
+
+namespace mlir {
+namespace func {
+class FuncOp;
+} // namespace func
+} // namespace mlir
+
+namespace xilinx {
+namespace aievec {
+
+#define GEN_PASS_DECL
+#define GEN_PASS_CLASSES
+#include "aie/Dialect/AIEVec/Analysis/Passes.h.inc"
+
+std::unique_ptr<mlir::Pass> createAIEVecConvolutionAnalysisPass();
+
+/// Generate the code for registering passes.
+#define GEN_PASS_REGISTRATION
+#include "aie/Dialect/AIEVec/Analysis/Passes.h.inc"
+
+} // end namespace aievec
+} // end namespace xilinx
+
+#endif // AIE_DIALECT_AIEVEC_ANALYSIS_PASSES_H
diff --git a/include/aie/Dialect/AIEVec/Analysis/Passes.td b/include/aie/Dialect/AIEVec/Analysis/Passes.td
new file mode 100644
index 0000000000..078504df4e
--- /dev/null
+++ b/include/aie/Dialect/AIEVec/Analysis/Passes.td
@@ -0,0 +1,28 @@
+//=== Passes.td - AIE vector analysis pass definition file -*- tablegen -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// (c) Copyright 2023 AMD Inc.
+//
+//===----------------------------------------------------------------------===//
+// This file contains definitions for passes within the AIEVec/ directory.
+//===----------------------------------------------------------------------===//
+
+#ifndef AIE_DIALECT_AIEVEC_ANALYSIS_PASSES
+#define AIE_DIALECT_AIEVEC_ANALYSIS_PASSES
+
+include "mlir/Pass/PassBase.td"
+
+def AIEVecConvAnalysis : Pass<"aievec-convolution-analysis", "mlir::func::FuncOp"> {
+  let summary = "Find MAC chains that can be replaced by convolution ops in "
+                "AIE-ML";
+  let constructor = "xilinx::aievec::createAIEVecConvolutionAnalysisPass()";
+  let options = [
+    Option<"printResult", "print", "bool", /*default=*/"false",
+           "Print the result of the analysis">,
+  ];
+}
+
+#endif // AIE_DIALECT_AIEVEC_ANALYSIS_PASSES
diff --git a/include/aie/Dialect/AIEVec/CMakeLists.txt b/include/aie/Dialect/AIEVec/CMakeLists.txt
index 1c3baf43bc..0603587912 100644
--- a/include/aie/Dialect/AIEVec/CMakeLists.txt
+++ b/include/aie/Dialect/AIEVec/CMakeLists.txt
@@ -5,5 +5,6 @@
 #
 # (c) Copyright 2022 Xilinx Inc.
 
+add_subdirectory(Analysis)
 add_subdirectory(IR)
 add_subdirectory(Transforms)
diff --git a/lib/Dialect/AIEVec/Transforms/AIEVecOptimizations.cpp b/lib/Dialect/AIEVec/Transforms/AIEVecOptimizations.cpp
index 16b25df533..006ffaa9a2 100644
--- a/lib/Dialect/AIEVec/Transforms/AIEVecOptimizations.cpp
+++ b/lib/Dialect/AIEVec/Transforms/AIEVecOptimizations.cpp
@@ -1,4 +1,5 @@
 #include "aie/Dialect/AIEVec/AIEVecUtils.h"
+#include "aie/Dialect/AIEVec/Analysis/Passes.h"
 #include "aie/Dialect/AIEVec/IR/AIEVecOps.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
@@ -192,6 +193,10 @@ populateAIEVecV2TransformationPatterns(RewritePatternSet &patterns) {
   patterns.add(patterns.getContext());
 }
 
+//===----------------------------------------------------------------------===//
+// Legalizations
+//===----------------------------------------------------------------------===//
+
 static void
 configureAIEVecV1TransformationLegalizations(ConversionTarget &target) {
   target.addLegalDialect();
   target.addDynamicallyLegalOp(
   });
 }
 
-//===----------------------------------------------------------------------===//
-// Legalizations
-//===----------------------------------------------------------------------===//
 static void
 configureAIEVecV2TransformationLegalizations(ConversionTarget &target) {
   target.addDynamicallyLegalOp(
@@ -373,9 +375,11 @@ void xilinx::aievec::buildOptimizeAIEVec(OpPassManager &pm,
   pm.addPass(createCSEPass());
   pm.addPass(createCanonicalizerPass());
 
-  // TODO: This pass should only be included if the target is AIEML.
   // Add generating aievec convolution ops pass
-  pm.addPass(createAIEVecConvOpTransformationPass(options));
+  if (options.aieTarget == "aieml") {
+    pm.addPass(createAIEVecConvolutionAnalysisPass());
+    pm.addPass(createAIEVecConvOpTransformationPass(options));
+  }
 
   // Add post-lowering canonicalization passes.
   pm.addPass(createCSEPass());
diff --git a/lib/Dialect/AIEVec/Transforms/CMakeLists.txt b/lib/Dialect/AIEVec/Transforms/CMakeLists.txt
index 6a8fbc686b..88d414e96a 100644
--- a/lib/Dialect/AIEVec/Transforms/CMakeLists.txt
+++ b/lib/Dialect/AIEVec/Transforms/CMakeLists.txt
@@ -16,9 +16,11 @@ add_mlir_dialect_library(MLIRAIEVecTransforms
 
   ADDITIONAL_HEADER_DIRS
   ${CMAKE_CURRENT_SOURCE_DIR}/../../../include/aie/Dialect/AIEVec/Transforms
+  ${CMAKE_CURRENT_SOURCE_DIR}/../../../include/aie/Dialect/AIEVec/Analysis
 
   DEPENDS
   MLIRAIEVecPassIncGen
+  MLIRAIEVecAnalysisPassIncGen
 
   LINK_LIBS PUBLIC
   MLIRIR
diff --git a/lib/Dialect/AIEVec/Transforms/ConvertVectorToAIEVec.cpp b/lib/Dialect/AIEVec/Transforms/ConvertVectorToAIEVec.cpp
index 34ef680bab..ba83c28822 100644
--- a/lib/Dialect/AIEVec/Transforms/ConvertVectorToAIEVec.cpp
+++ b/lib/Dialect/AIEVec/Transforms/ConvertVectorToAIEVec.cpp
@@ -45,6 +45,10 @@ using namespace xilinx::aievec;
 
 #define DEBUG_TYPE "vector-to-aievec-conversion"
 
+//===----------------------------------------------------------------------===//
+// Rewrite patterns
+//===----------------------------------------------------------------------===//
+
 template <typename OpTy>
 struct SetInboundsToReadStoreOpPattern : public RewritePattern {
   SetInboundsToReadStoreOpPattern(MLIRContext *context)
@@ -73,6 +77,10 @@ struct SetInboundsToReadStoreOpPattern : public RewritePattern {
 using SetInboundsToReadOp = SetInboundsToReadStoreOpPattern<vector::TransferReadOp>;
 using SetInboundsToWriteOp = SetInboundsToReadStoreOpPattern<vector::TransferWriteOp>;
 
+//===----------------------------------------------------------------------===//
+// Lowering passes
+//===----------------------------------------------------------------------===//
+
 struct RedundantLoadStoreOptimizationPass
     : public PassWrapper<RedundantLoadStoreOptimizationPass,
                          OperationPass<func::FuncOp>> {
@@ -111,7 +119,6 @@ void xilinx::aievec::buildConvertVectorToAIEVec(
   // NOTE: This sub-pipeline ingests arbitrary MLIR Vector code.
   buildCanonicalizeVectorForAIEVec(
       pm, options.getCanonicalizeVectorForAIEVecOptions());
-
   // NOTE: At this stage, all the Vector code in the IR can be mapped
   // NOTE: to AIEVec operations.
 
@@ -122,7 +129,6 @@ void xilinx::aievec::buildConvertVectorToAIEVec(
   // NOTE: This sub-pipeline ingests MLIR Vector code that can be mapped to
   // NOTE: AIEVec operations.
   buildLowerVectorToAIEVec(pm, options.getLowerVectorToAIEVecOptions());
-
   // NOTE: At this stage, all vector operations are expressed in AIEVec dialect.
 
 //============================================================================
diff --git a/lib/Dialect/AIEVec/Transforms/FoldMulAddChainToConvOp.cpp b/lib/Dialect/AIEVec/Transforms/FoldMulAddChainToConvOp.cpp
index 987fe6014c..77c1aaf30c 100644
--- a/lib/Dialect/AIEVec/Transforms/FoldMulAddChainToConvOp.cpp
+++ b/lib/Dialect/AIEVec/Transforms/FoldMulAddChainToConvOp.cpp
@@ -12,11 +12,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "aie/Dialect/AIEVec/AIEVecUtils.h"
+#include "aie/Dialect/AIEVec/Analysis/Passes.h"
 #include "aie/Dialect/AIEVec/IR/AIEVecOps.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Pass/AnalysisManager.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include <algorithm>
+#include <memory>
 
 #include "FoldMulAddChainToConvOp.h"
 
@@ -26,367 +29,327 @@ using namespace mlir;
 using namespace vector;
 using namespace xilinx;
 using namespace xilinx::aievec;
 
-typedef std::tuple<int8_t, aievec::UPDOp, arith::MulIOp> MulDefTupleTy;
-using MulDefTupleVecTy = SmallVector<MulDefTupleTy, 8>;
-using MulDefMapTy = DenseMap<Value, MulDefTupleVecTy>;
-
-// If only one of the operands of the given add op is an add op, return that
-// operand's defining op; otherwise return null.
-arith::AddIOp getDefAddOp(arith::AddIOp addOp) {
-  auto defLhs = dyn_cast<arith::AddIOp>(addOp->getOperand(0).getDefiningOp());
-  auto defRhs = dyn_cast<arith::AddIOp>(addOp->getOperand(1).getDefiningOp());
-  if ((!defLhs && !defRhs) || (defLhs && defRhs)) {
-    return nullptr;
-  }
-  return defLhs ? defLhs : defRhs;
-}
-
-// Return true if one of the operands of the given mul op is a broadcast of a
-// upd op and the other operand of the mul op is a upd op. In this case, the
-// arguments are recorded for bookkeeping. Otherwise, return false and leave
-// the bookkeeping unchanged.
-bool checkChainPattern(arith::MulIOp mulOp, MulDefMapTy &macChainMap,
-                       SmallVectorImpl<Value> &bcastOpSourceVec) {
-  aievec::BroadcastOp bcastOp = nullptr;
-  aievec::UPDOp updOp = nullptr;
-
-  if (isa<aievec::BroadcastOp>(mulOp.getOperand(0).getDefiningOp())) {
-    bcastOp = cast<aievec::BroadcastOp>(mulOp->getOperand(0).getDefiningOp());
-    if (!isa<aievec::UPDOp>(mulOp->getOperand(1).getDefiningOp())) {
-      return false;
-    }
-    updOp = cast<aievec::UPDOp>(mulOp->getOperand(1).getDefiningOp());
-  } else if (isa<aievec::BroadcastOp>(mulOp.getOperand(1).getDefiningOp())) {
-    bcastOp = cast<aievec::BroadcastOp>(mulOp->getOperand(1).getDefiningOp());
-    if (!isa<aievec::UPDOp>(mulOp->getOperand(0).getDefiningOp())) {
-      return false;
-    }
-    updOp = cast<aievec::UPDOp>(mulOp->getOperand(0).getDefiningOp());
-  } else {
-    return false;
-  }
+namespace xilinx::aievec {
+#define GEN_PASS_DEF_AIEVECCONVANALYSIS
+#include "aie/Dialect/AIEVec/Analysis/Passes.h.inc"
+} // namespace xilinx::aievec
 
-  if (!isa<aievec::UPDOp>(bcastOp.getSource().getDefiningOp())) {
-    return false;
-  }
-
-  if (!macChainMap.count(bcastOp.getSource())) {
-    bcastOpSourceVec.push_back(bcastOp.getSource());
-    MulDefTupleVecTy tupleVec;
-    tupleVec.push_back(std::make_tuple(bcastOp.getIdx(), updOp, mulOp));
-    macChainMap.insert(std::make_pair(bcastOp.getSource(), tupleVec));
-  } else {
-    macChainMap[bcastOp.getSource()].push_back(
-        std::make_tuple(bcastOp.getIdx(), updOp, mulOp));
-  }
-  return true;
-}
-
-// The defs of mul ops consist of an upd op and a broadcast op.
-// The chain map looks like below:
-// | BroadcastOp source | vector<MulDefTupleTy> |
-// The mul add op chain can be grouped by broadcast op's source.
-// For each group, broadcastOp idx can be sorted to find the start of the
-// memrefs used by broadcast op and upd op.
-void buildChainMap(arith::AddIOp curAddOp, bool &hasMulConv, Value &acc,
-                   MulDefMapTy &macChainMap,
-                   SmallVectorImpl<Value> &bcastOpSourceVec) {
-  while (true) {
-    auto defLhs =
-        dyn_cast<arith::MulIOp>(curAddOp->getOperand(0).getDefiningOp());
-    auto defRhs =
-        dyn_cast<arith::MulIOp>(curAddOp->getOperand(1).getDefiningOp());
-
-    if (!defLhs && !defRhs) {
-      break;
-    }
-    // If both ops of add op are mul ops, this will reach the top of the
-    // chain. Check the legality for both mul op and insert them to the chain
-    // map.
-    else if (defLhs && defRhs) {
-      if (!checkChainPattern(defLhs, macChainMap, bcastOpSourceVec) ||
-          !checkChainPattern(defRhs, macChainMap, bcastOpSourceVec)) {
-        break;
-      }
-      hasMulConv = true;
+/// This analysis builds the longest possible chain of MAC operations whose
+/// operands are a vector that may or may not be shifted, and a broadcast.
+/// That is, these MACs represent `vector x scalar` ops, and are candidates to
+/// be grouped and replaced by mul_conv/fma_conv ops in AIE-ML.
+//
+// We build this chain recursively, climbing up the use-def chain of
+// accumulators.
+struct LongestConvMACChainAnalysis {
+  static AnalysisManager *am;
+
+  struct ConvMac {
+    // If there's a non-accumulating convolution upchain,
+    // store it here temporarily.
+    std::unique_ptr<ConvMac> topOfChainMulConv;
+    // Accumulator value, if there is one.
+    Value acc;
+    // Left-hand side (non-broadcasting) source value
+    Value lhs;
+    // Right-hand side (broadcasting) source value
+    Value rhs;
+    // Amount that lhs is shifted
+    uint8_t shift;
+    // Element in rhs that is broadcasted
+    uint8_t bcastIdx;
+    ConvMac(Value lhs, Value rhs, uint8_t shift, uint8_t bcastIdx)
+        : topOfChainMulConv(nullptr), acc(nullptr), lhs(lhs), rhs(rhs),
+          shift(shift), bcastIdx(bcastIdx) {}
+  };
+
+  struct ConvMacChainGroup {
+    // Group start index within the chain
+    uint64_t fromIdx;
+    // Index in chain after group last MAC
+    uint64_t toIdx;
+    // Initial position of the signal to be convolved
+    int64_t signalShift;
+    // Initial position of the convolution filter
+    int64_t bcastShift;
+    // Distance between elements in the filter
+    int64_t bcastDist; // Must be 1 or 2
+  };
+
+  typedef SmallVector<std::unique_ptr<ConvMac>, 8> ConvMacChain;
+  typedef SmallVector<ConvMacChainGroup, 8> ConvMacChainGroupList;
+
+  std::unique_ptr<ConvMacChain> convMacChain;
+  ConvMacChainGroupList groupsInChain;
+
+  /// Sort the chain of MACs by sources. When two MACs share the same sources,
+  /// sort them by the broadcast index. If they don't, sort them by the order
+  /// of the ops in the code. This function should be called after the chain
+  /// is completed, and before operating on the groups of MACs. After sorting,
+  /// MACs that can be fused into single convolution ops will be contiguous in
+  /// the chain.
+  void sortChain() {
+    if ((*convMacChain)[0]->acc) {
+      std::sort(convMacChain->begin(), convMacChain->end(),
+                [](const auto &a, const auto &b) {
+                  if (a->lhs == b->lhs) {
+                    if (a->rhs == b->rhs)
+                      return a->bcastIdx < b->bcastIdx;
+                    return a->rhs.getDefiningOp()->isBeforeInBlock(
+                        b->rhs.getDefiningOp());
+                  }
+                  // We should probably sort by lhs load address, if it exists
+                  // XXX: We assume all MACs are in the same block. If they're
+                  // XXX: not, this will assert.
+                  return a->lhs.getDefiningOp()->isBeforeInBlock(
+                      b->lhs.getDefiningOp());
+                });
-    } else {
-      arith::MulIOp curMulOp = defLhs ? defLhs : defRhs;
-      if (!checkChainPattern(curMulOp, macChainMap, bcastOpSourceVec)) {
-        break;
-      }
-      acc = defLhs ? curAddOp->getOperand(1) : curAddOp->getOperand(0);
-    }
-
-    // Get the def add op of the curOp operands
-    arith::AddIOp defAddOp = getDefAddOp(curAddOp);
-
-    // The user/consumer operation must be an add op, belonging to
-    // the same basic block as curOp.
-    if (!defAddOp || !defAddOp->hasOneUse() ||
-        curAddOp->getBlock() != defAddOp->getBlock()) {
-      break;
+    } else {
+      // If the top of the chain is not an accumulation, bring up all related
+      // convolution MACs and sort the rest by lhs.
+      auto firstLhs = (*convMacChain)[0]->lhs;
+      std::sort(convMacChain->begin(), convMacChain->end(),
+                [&firstLhs](const auto &a, const auto &b) {
+                  if (a->lhs == b->lhs) {
+                    if (a->rhs == b->rhs)
+                      return a->bcastIdx < b->bcastIdx;
+                    return a->rhs.getDefiningOp()->isBeforeInBlock(
+                        b->rhs.getDefiningOp());
+                  }
+                  if (a->lhs == firstLhs)
+                    return true;
+                  if (b->lhs == firstLhs)
+                    return false;
+                  return a->lhs.getDefiningOp()->isBeforeInBlock(
+                      b->lhs.getDefiningOp());
+                });
+      // Float the empty accumulator to the top.
+      if ((*convMacChain)[0]->acc)
+        for (auto &convMac : *convMacChain)
+          if (!convMac->acc) {
+            std::swap((*convMacChain)[0]->acc, convMac->acc);
+            break;
+          }
+    }
+  }
-    curAddOp = defAddOp;
-  }
-}
-
-void refreshFusedGroups(
-    MulDefTupleTy defTuple, arith::MulIOp nextMulOp,
-    SmallVector<arith::MulIOp, 8> &fusedOps,
-    SmallVectorImpl<SmallVector<arith::MulIOp, 8>> &groupFusedOps,
-    int8_t &curIdx, aievec::UPDOp &curUpdOp, arith::MulIOp &curMulOp) {
-  groupFusedOps.push_back(fusedOps);
-  fusedOps.clear();
-  fusedOps.push_back(nextMulOp);
-  std::tie(curIdx, curUpdOp, curMulOp) = defTuple;
-}
-
-// Check whether mul add chain is valid for the transformation and classify the
-// fused ops into different groups with valid constant memref distances.
-bool collectFusedOps(
-    unsigned maxGroupSize, unsigned &dupFactor,
-    SmallVectorImpl<Value> &bcastOpSourceVec,
-    SmallVectorImpl<SmallVector<arith::MulIOp, 8>> &groupFusedOps,
-    MulDefMapTy &macChainMap) {
-  int xDist = -1, zDist = -1;
-  for (auto item : bcastOpSourceVec) {
-    auto macChain = macChainMap[item];
-    std::sort(macChain.begin(), macChain.end());
-    int8_t curIdx = 0;
-    aievec::UPDOp curUpdOp = nullptr;
-    arith::MulIOp curMulOp = nullptr;
-    std::tie(curIdx, curUpdOp, curMulOp) = *macChain.begin();
-    SmallVector<int32_t, 8> dists;
-    SmallVector<arith::MulIOp, 8> fusedOps;
-    fusedOps.push_back(curMulOp);
-
-    for (auto it = std::next(macChain.begin()); it != macChain.end(); ++it) {
-      int8_t nextIdx = 0;
-      aievec::UPDOp nextUpdOp = nullptr;
-      arith::MulIOp nextMulOp = nullptr;
-      MulDefTupleTy defTuple = *it;
-      std::tie(nextIdx, nextUpdOp, nextMulOp) = defTuple;
-
-      int32_t dist = nextIdx - curIdx;
-
-      // Target AIE-ML intrinsic mac_conv_32x8 for v32int8 type and
-      // mac_conv_16x4 for v16int16 type. Thus, the distance of broadcast op
-      // source between two mul add ops cannot be larger than 32/8 or 16/4,
-      // which is 4. If dist is larger than 1, we need to shuffle the load to
-      // get the elements with the interval of dist.
-      if (dist > 4) {
-        if (fusedOps.size() < 2) {
-          return false;
-        }
-        refreshFusedGroups(defTuple, nextMulOp, fusedOps, groupFusedOps, curIdx,
-                           curUpdOp, curMulOp);
-        continue;
-      }
-
-      dists.push_back(dist);
-      if (curUpdOp.getSource() != nextUpdOp.getSource()) {
-        if (fusedOps.size() < 2) {
-          return false;
-        }
-        refreshFusedGroups(defTuple, nextMulOp, fusedOps, groupFusedOps, curIdx,
-                           curUpdOp, curMulOp);
-        continue;
-      }
-
-      MemRefType curMemRefType =
-          cast<MemRefType>(curUpdOp.getSource().getType());
-      MemRefType nextMemRefType =
-          cast<MemRefType>(nextUpdOp.getSource().getType());
-
-      ArrayRef<int64_t> curSizes = curMemRefType.getShape();
-      ArrayRef<int64_t> nextSizes = nextMemRefType.getShape();
-      if (curSizes.size() != nextSizes.size()) {
-        if (fusedOps.size() < 2) {
-          return false;
-        }
-        refreshFusedGroups(defTuple, nextMulOp, fusedOps, groupFusedOps, curIdx,
-                           curUpdOp, curMulOp);
-        continue;
-      }
-
-      AffineExpr curLinearAccess =
-          constructLinearizedAffineExprForUPDOp(curUpdOp);
-      AffineExpr nextLinearAccess =
-          constructLinearizedAffineExprForUPDOp(nextUpdOp);
-      if (!curLinearAccess || !nextLinearAccess) {
-        if (fusedOps.size() < 2) {
-          return false;
-        }
-        refreshFusedGroups(defTuple, nextMulOp, fusedOps, groupFusedOps, curIdx,
-                           curUpdOp, curMulOp);
-        continue;
-      }
-
-      AffineExpr curBase, nextBase;
-      int32_t curOffset, nextOffset;
-
-      // Get the base and offset from linear access expr
-      std::tie(curBase, curOffset) = extractBaseAndOffset(curLinearAccess);
-      std::tie(nextBase, nextOffset) = extractBaseAndOffset(nextLinearAccess);
-      if (curBase != nextBase) {
-        if (fusedOps.size() < 2) {
-          return false;
-        }
-        refreshFusedGroups(defTuple, nextMulOp, fusedOps, groupFusedOps, curIdx,
-                           curUpdOp, curMulOp);
-        continue;
-      }
-
-      dist = nextOffset - curOffset;
-      if (dist != 1) {
-        if (fusedOps.size() < 2) {
-          return false;
-        }
-        refreshFusedGroups(defTuple, nextMulOp, fusedOps, groupFusedOps, curIdx,
-                           curUpdOp, curMulOp);
-        continue;
-      }
-      dists.push_back(dist);
-      if ((xDist != -1 && xDist != dists[0]) ||
-          (zDist != -1 && zDist != dists[1])) {
-        if (fusedOps.size() < 2) {
-          return false;
+  // Return the list of convolution MAC ops in the chain as pairs of indices
+  // indicating the position within the chain where a group starts and the
+  // position where it ends: [start, end). If they have not been precomputed
+  // yet, this method will generate them.
+  const ConvMacChainGroupList &getGroupsInChain() {
+    // If there is no chain, or the groups have already been computed, return
+    // the stored list.
+    if (groupsInChain.size() > 0 || !convMacChain || convMacChain->size() == 0)
+      return groupsInChain;
+
+    uint64_t grpStartIdx = 0;
+    uint64_t grpCurIdx = 0;
+    Value curLhs = (*convMacChain)[0]->lhs;
+    Value curRhs = (*convMacChain)[0]->rhs;
+    for (const auto &convMac : *convMacChain) {
+      if (grpCurIdx > grpStartIdx) {
+        if (curLhs != convMac->lhs || curRhs != convMac->rhs) {
+          groupsInChain.push_back({grpStartIdx, grpCurIdx,
+                                   getGroupSignalShift(grpStartIdx, grpCurIdx),
+                                   getGroupBcastShift(grpStartIdx, grpCurIdx),
+                                   getGroupBcastDist(grpStartIdx, grpCurIdx)});
+          grpStartIdx = grpCurIdx;
+          curLhs = convMac->lhs;
+          curRhs = convMac->rhs;
         }
-        refreshFusedGroups(defTuple, nextMulOp, fusedOps, groupFusedOps, curIdx,
-                           curUpdOp, curMulOp);
-        continue;
-      }
-
-      xDist = dists[0];
-      zDist = dists[1];
-      dupFactor = dists[0];
-
-      fusedOps.push_back(nextMulOp);
-      std::tie(curIdx, curUpdOp, curMulOp) = defTuple;
-
-      if (fusedOps.size() > maxGroupSize) {
-        fusedOps.pop_back();
-        refreshFusedGroups(defTuple, nextMulOp, fusedOps, groupFusedOps, curIdx,
-                           curUpdOp, curMulOp);
-        continue;
      }
+      grpCurIdx++;
    }
-    groupFusedOps.push_back(fusedOps);
+    if (grpStartIdx < grpCurIdx)
+      groupsInChain.push_back({grpStartIdx, grpCurIdx,
+                               getGroupSignalShift(grpStartIdx, grpCurIdx),
+                               getGroupBcastShift(grpStartIdx, grpCurIdx),
+                               getGroupBcastDist(grpStartIdx, grpCurIdx)});
+    return groupsInChain;
  }
-  return true;
-}
-
-struct canFoldMulAddChainToConvOpAnalysis {
-  canFoldMulAddChainToConvOpAnalysis(arith::AddIOp addOp) {
-    if (!isa<VectorType>(addOp.getType())) {
-      canFoldMulAddChainToConvOp = false;
-      return;
-    }
-    VectorType resultType = cast<VectorType>(addOp.getResult().getType());
+  // Return the signal shift for the group of MACs in the chain within
+  // [fromIdx, toIdx). This method verifies that the elements of the signal
+  // are contiguously accessed. If they are not, or the specified group
+  // doesn't exist, this function returns -1.
+  int64_t getGroupSignalShift(uint64_t fromIdx, uint64_t toIdx) {
+    if (fromIdx >= toIdx || toIdx > convMacChain->size())
+      return -1;
+    if (toIdx == fromIdx + 1)
+      return static_cast<int64_t>((*convMacChain)[fromIdx]->shift);
+    for (uint64_t i = fromIdx; i < toIdx - 1; i++)
+      if ((static_cast<int64_t>((*convMacChain)[i + 1]->shift) -
+           static_cast<int64_t>((*convMacChain)[i]->shift)) != 1)
+        return -1;
+    return static_cast<int64_t>((*convMacChain)[fromIdx]->shift);
+  }
 
-    if (!resultType.getElementType().isa<IntegerType>()) {
-      canFoldMulAddChainToConvOp = false;
-      return;
-    }
+  // Return the shift of the first broadcasted element in the group. If there
+  // is no chain, or the group does not exist, returns -1.
+  int64_t getGroupBcastShift(uint64_t fromIdx, uint64_t toIdx) {
+    if (fromIdx >= toIdx || toIdx > convMacChain->size())
+      return -1;
+    return static_cast<int64_t>((*convMacChain)[fromIdx]->bcastIdx);
+  }
 
-    IntegerType resultElType = cast<IntegerType>(resultType.getElementType());
-    unsigned resultElWidth = resultElType.getWidth();
-    unsigned laneSize = getVectorLaneSize(resultType);
+  // Returns the broadcast distance between elements within the group. If the
+  // distance is not constant and equal to 1 or 2, it returns -1.
+  int64_t getGroupBcastDist(uint64_t fromIdx, uint64_t toIdx) {
+    if (fromIdx >= toIdx || toIdx > convMacChain->size())
+      return -1;
+    if (toIdx == fromIdx + 1)
+      return 1;
+    int64_t bcastDist =
+        static_cast<int64_t>((*convMacChain)[fromIdx + 1]->bcastIdx) -
+        static_cast<int64_t>((*convMacChain)[fromIdx]->bcastIdx);
+    if (bcastDist != 1 && bcastDist != 2)
+      return -1;
+    for (uint64_t i = fromIdx + 1; i < toIdx - 1; i++)
+      if ((static_cast<int64_t>((*convMacChain)[i + 1]->bcastIdx) -
+           static_cast<int64_t>((*convMacChain)[i]->bcastIdx)) != bcastDist)
+        return -1;
+    return bcastDist;
+  }
 
-    if ((laneSize != 32 || resultElWidth != 8) &&
-        (laneSize != 16 || resultElWidth != 16)) {
-      canFoldMulAddChainToConvOp = false;
-      return;
-    }
+  bool canChainBeReplacedWithConvOps() {
+    const auto &groups = getGroupsInChain();
+    if (groups.size() == 0)
+      return false;
+    for (const auto &group : groups)
+      if (group.signalShift == -1 || group.bcastShift == -1 ||
+          group.bcastDist == -1)
+        return false;
+    return true;
+  }
 
-    if (!addOp->hasOneUse()) {
-      canFoldMulAddChainToConvOp = false;
-      return;
+  std::unique_ptr<ConvMac> getConvMacFromMulOp(arith::MulIOp mulOp) {
+    auto mulOpLhsDefOp = mulOp.getLhs().getDefiningOp();
+    auto mulOpRhsDefOp = mulOp.getRhs().getDefiningOp();
+    if (!mulOpLhsDefOp || !mulOpRhsDefOp)
+      return nullptr;
+
+    // Obtain the broadcast operation feeding into the MulIOp
+    auto bcastOp = dyn_cast<aievec::BroadcastOp>(mulOpRhsDefOp);
+    if (!bcastOp) {
+      bcastOp = dyn_cast<aievec::BroadcastOp>(mulOpLhsDefOp);
+      std::swap(mulOpLhsDefOp, mulOpRhsDefOp);
    }
+    if (!bcastOp)
+      return nullptr;
+
+    // Obtain the ext or ext->shift op feeding into the MulIOp
+    aievec::ExtOp extOp = nullptr;
+    aievec::ShiftOp shiftOp = nullptr;
+    shiftOp = dyn_cast<aievec::ShiftOp>(mulOpLhsDefOp);
+    if (shiftOp)
+      extOp = shiftOp.getLhs().getDefiningOp<aievec::ExtOp>();
+    else
+      extOp = dyn_cast<aievec::ExtOp>(mulOpLhsDefOp);
+
+    // XXX: Actually, ExtOp might not exist but should work anyway.
+    // XXX: Should it, though?
+    if (!extOp)
+      return nullptr;
+
+    Value lhs = extOp.getSource();
+    Value rhs = bcastOp.getSource();
+    uint8_t shift = 0;
+    if (shiftOp) {
+      auto shiftConstDefOp =
+          shiftOp.getShift().getDefiningOp<arith::ConstantOp>();
+      if (shiftConstDefOp) {
+        auto shiftAttr = cast<IntegerAttr>(shiftConstDefOp.getValue());
+        auto vType = cast<VectorType>(mulOp.getResult().getType());
+        shift = 8 * shiftAttr.getInt() / getElementSizeInBits(vType);
+      }
+    }
+    uint8_t bcastIdx = bcastOp.getIdx();
+    return std::make_unique<ConvMac>(lhs, rhs, shift, bcastIdx);
+  }
 
-    // Search for the last add op in the block.
-    auto usrOp = *addOp->getUsers().begin();
-    if (!usrOp || isa<arith::AddIOp>(usrOp)) {
-      canFoldMulAddChainToConvOp = false;
-      return;
+  std::unique_ptr<ConvMac> getConvMacFromAddOp(arith::AddIOp addOp) {
+    // Make sure at least one of them is a multiplication, and the other one
+    // is the accumulator coming from up the chain.
+    auto mulOp = addOp.getLhs().getDefiningOp<arith::MulIOp>();
+    Value acc = addOp.getRhs();
+    if (!mulOp) {
+      mulOp = addOp.getRhs().getDefiningOp<arith::MulIOp>();
+      acc = addOp.getLhs();
    }
 
-    arith::AddIOp curAddOp = addOp;
-    // bcastOpSourceVec is a container to trace the order of broadcast ops'
-    // source in the chain.
-    SmallVector<Value, 8> bcastOpSourceVec;
-
-    // Identify the chain and build a mul add chain map by recording the def of
-    // mul ops.
-    buildChainMap(curAddOp, hasMulConv, acc, macChainMap, bcastOpSourceVec);
-
-    if (macChainMap.empty() ||
-        std::any_of(macChainMap.begin(), macChainMap.end(),
-                    [](const auto &p) { return p.second.size() < 2; })) {
-      canFoldMulAddChainToConvOp = false;
-      return;
-    }
-
-    // Since we trace the order forwards, now reverse the vector.
-    std::reverse(bcastOpSourceVec.begin(), bcastOpSourceVec.end());
-
-    auto getConstantIdx = [](Value v) {
-      aievec::UPDOp bcastUPDOp = cast<aievec::UPDOp>(v.getDefiningOp());
-      SmallVector<Value, 8> indices(bcastUPDOp.getIndices().begin(),
-                                    bcastUPDOp.getIndices().end());
-      Value innerMostIdx = indices[indices.size() - 1];
-      int64_t val = -1;
-      if (auto idxDefOp = innerMostIdx.getDefiningOp()) {
-        if (auto constOp = dyn_cast<arith::ConstantOp>(idxDefOp)) {
-          val = cast<IntegerAttr>(constOp.getValue()).getInt();
+    if (!mulOp)
+      return nullptr;
+
+    // Get the parameters of the convolution from the operands of the MulIOp
+    auto convMac = getConvMacFromMulOp(mulOp);
+    if (!convMac)
+      return nullptr;
+
+    // If both sides are MulIOp, we might be at the top of the chain
+    auto upChainAccMulOp = acc.getDefiningOp<arith::MulIOp>();
+    if (upChainAccMulOp) {
+      auto convMac2 = getConvMacFromMulOp(upChainAccMulOp);
+      // XXX: We pre-sort the top two MACs to make sure that an undefined
+      // XXX: accumulator ends up on top of the chain.
+      // XXX: But it might not be necessary? CHECK!
+      if (convMac2 && convMac->lhs == convMac2->lhs &&
+          convMac->rhs == convMac2->rhs) {
+        if (convMac->bcastIdx < convMac2->bcastIdx &&
+            convMac->shift < convMac2->shift) {
+          convMac2->topOfChainMulConv = std::move(convMac);
+          convMac2->acc = acc;
+          return convMac2;
+        } else if (convMac->bcastIdx > convMac2->bcastIdx &&
+                   convMac->shift > convMac2->shift) {
+          convMac->topOfChainMulConv = std::move(convMac2);
+          convMac->acc = acc;
+          return convMac;
+        } else {
+          // WARNING: In this situation, the chain is ambiguous and picking one
+          // WARNING: option over the other may result in a successful
+          // WARNING: and/or better replacement. Here, we are assuming that it
+          // WARNING: is going to be either one or the other, or it won't
+          // WARNING: matter.
        }
+      } else {
+        convMac->topOfChainMulConv = std::move(convMac2);
      }
-      return val;
-    };
-
-    // If broadcast ops' sources are from the same memref, sort the broadcast
-    // ops by an increasing order of memrefs' constant indices.
-    std::sort(bcastOpSourceVec.begin(), bcastOpSourceVec.end(),
-              [&](const Value &a, const Value &b) {
-                aievec::UPDOp bcastUPDOpA =
-                    cast<aievec::UPDOp>(a.getDefiningOp());
-                aievec::UPDOp bcastUPDOpB =
-                    cast<aievec::UPDOp>(b.getDefiningOp());
-                if (bcastUPDOpA.getSource() == bcastUPDOpB.getSource()) {
-                  return getConstantIdx(a) <= getConstantIdx(b);
-                }
-                return true;
-              });
-
-    unsigned maxGroupSize = resultElWidth == 16 ? 4 : 8;
-
-    // Legality check for the mul add chain, and collect the ops that can be
-    // transformed to mul_conv and mul_conv.
-    if (!collectFusedOps(maxGroupSize, dupFactor, bcastOpSourceVec,
-                         groupFusedOps, macChainMap)) {
-      canFoldMulAddChainToConvOp = false;
-      return;
-    }
+    convMac->acc = acc;
+    return convMac;
+  }
 
-    if (std::any_of(groupFusedOps.begin(), groupFusedOps.end(),
-                    [](const auto &ops) { return ops.size() < 2; })) {
-      canFoldMulAddChainToConvOp = false;
+  LongestConvMACChainAnalysis(arith::AddIOp addOp) {
+    std::unique_ptr<ConvMac> macConvChainElem = getConvMacFromAddOp(addOp);
+    if (!macConvChainElem)
      return;
+
+    if (macConvChainElem->acc) {
+      auto upChainAddOp = macConvChainElem->acc.getDefiningOp<arith::AddIOp>();
+      if (upChainAddOp) {
+        auto &upChainChainAnalysis =
+            am->getChildAnalysis<LongestConvMACChainAnalysis>(upChainAddOp);
+        if (upChainChainAnalysis.convMacChain) {
+          convMacChain = std::move(upChainChainAnalysis.convMacChain);
+          convMacChain->push_back(std::move(macConvChainElem));
+          return;
+        }
+      }
    }
-    canFoldMulAddChainToConvOp = true;
+    assert(!convMacChain && "Convolution MAC chain unexpectedly not empty");
+    convMacChain = std::make_unique<ConvMacChain>();
+    if (macConvChainElem->topOfChainMulConv)
+      convMacChain->push_back(std::move(macConvChainElem->topOfChainMulConv));
+    convMacChain->push_back(std::move(macConvChainElem));
  }
-
-  MulDefMapTy macChainMap;
-  SmallVector<SmallVector<arith::MulIOp, 8>, 8> groupFusedOps;
-  unsigned dupFactor;
-  bool hasMulConv;
-  Value acc;
-  bool canFoldMulAddChainToConvOp;
};
-
-// This conversion pattern folds a mul add chain into mul_conv and mac_conv
-// ops. We can handle the mul add chain with a random order.
+// HACK: For some reason, it's not possible to access the analysis manager from
+// HACK: within an analysis, but we need it to build the analysis recursively.
+// HACK: If there is a good reason not to do this, we should find an
+// HACK: alternative way to build the MAC chain.
+AnalysisManager *LongestConvMACChainAnalysis::am = nullptr;
+
+// This conversion pattern folds a MAC chain into mul_conv and mac_conv
+// ops. We can handle the MAC chain in an arbitrary order.
 struct FoldMulAddChainToConvOpPattern
     : public OpConversionPattern<arith::AddIOp> {
   using OpConversionPattern::OpConversionPattern;
@@ -399,152 +362,100 @@ struct FoldMulAddChainToConvOpPattern
   LogicalResult
   matchAndRewrite(arith::AddIOp srcOp, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    canFoldMulAddChainToConvOpAnalysis analysis =
-        am.getChildAnalysis<canFoldMulAddChainToConvOpAnalysis>(srcOp);
-    if (!analysis.canFoldMulAddChainToConvOp)
+    auto &convMacChainAnalysis =
+        am.getChildAnalysis<LongestConvMACChainAnalysis>(srcOp);
+    auto &convMacChain = convMacChainAnalysis.convMacChain;
+    if (!convMacChain)
      return failure();
 
-    SmallVector<SmallVector<arith::MulIOp, 8>, 8> groupFusedOps =
-        analysis.groupFusedOps;
-    MulDefMapTy macChainMap = analysis.macChainMap;
-    unsigned dupFactor = analysis.dupFactor;
-    bool hasMulConv = analysis.hasMulConv;
-    Value acc = analysis.acc;
-
-    for (auto fusedOps : groupFusedOps) {
-      arith::MulIOp mulOp = (*fusedOps.begin());
-
-      // Get the mul op's lhs and rhs defining ops. We keep splat op at rhs.
-      if (isa<aievec::BroadcastOp>(mulOp->getOperand(0).getDefiningOp())) {
-        Value left = mulOp->getOperand(0);
-        Value right = mulOp->getOperand(1);
-        mulOp->setOperand(0, right);
-        mulOp->setOperand(1, left);
-      }
-
-      Value lhs = mulOp->getOperand(0);
-      Value rhs = mulOp->getOperand(1);
-
-      VectorType vType = cast<VectorType>(mulOp.getResult().getType());
-      Type sType = vType.getElementType();
-      IntegerType iType = cast<IntegerType>(sType);
-      unsigned width = iType.getWidth() <= 8 ? 32 : 64;
-      int32_t M = iType.getWidth() == 8 ? 32 : 16;
-      int32_t N = iType.getWidth() == 8 ? 8 : 4;
-
-      Type ctype = mlir::IntegerType::get(iType.getContext(), width);
-      Type opType = VectorType::get(vType.getShape(), ctype);
-
-      aievec::BroadcastOp bcastOp =
-          cast<aievec::BroadcastOp>(rhs.getDefiningOp());
-      aievec::UPDOp bcastUPDOp =
-          cast<aievec::UPDOp>(bcastOp.getSource().getDefiningOp());
-      SmallVector<Value, 8> indices(bcastUPDOp.getIndices().begin(),
-                                    bcastUPDOp.getIndices().end());
-      unsigned lanes = 512 / getElementSizeInBits(vType);
-      VectorType resType = createVectorType(lanes, sType);
-      Value innerMostIdx = indices[indices.size() - 1];
-      Value newIdx = innerMostIdx;
-      int64_t val = -1;
-      int64_t defIdx = -1;
-      // Transfer
-      // %c32 = arith.constant 32 : index
-      // %1 = aievec.upd %arg1[%c32] {index = 0 : i8} : vector<32xi8>
-      // %2 = aievec.broadcast %1 {idx = 0 : i8} : vector<32xi8>
-      // to
-
-      // %c0 = arith.constant 0 : index
-      // %1 = aievec.upd %arg1[%c0] {index = 0 : i8} : vector<64xi8>
-      // %2 = aievec.broadcast %1 {idx = 32 : i8} : vector<32xi8>
-      if (auto idxDefOp = innerMostIdx.getDefiningOp()) {
-        if (auto constOp = dyn_cast<arith::ConstantOp>(idxDefOp)) {
-          val = cast<IntegerAttr>(constOp.getValue()).getInt();
-          if (val) {
-            defIdx = val / lanes * lanes;
-            val %= lanes;
-            newIdx = rewriter.create<arith::ConstantOp>(
-                constOp.getLoc(),
-                rewriter.getIntegerAttr(constOp.getType(), defIdx));
-            indices[indices.size() - 1] = newIdx;
-          }
-        }
+    auto loc = srcOp.getLoc();
+    VectorType vecTy = cast<VectorType>(srcOp.getResult().getType());
+    unsigned elemWidth = cast<IntegerType>(vecTy.getElementType()).getWidth();
+    unsigned accWidth = elemWidth <= 8 ? 32 : 64;
+    int32_t M = elemWidth == 8 ? 32 : 16;
+    int32_t N = elemWidth == 8 ? 8 : 4;
+
+    Type wideElemTy = IntegerType::get(getContext(), accWidth);
+    Type accVecTy = VectorType::get(vecTy.getShape(), wideElemTy);
+
+    const auto &groups = convMacChainAnalysis.getGroupsInChain();
+    Value grpAcc = (*convMacChain)[groups[0].fromIdx]->acc;
+    if (grpAcc)
+      grpAcc = rewriter
+                   .create<aievec::UPSOp>(srcOp.getLoc(), accVecTy, grpAcc,
+                                          /*shift=*/0)
+                   .getResult();
+    for (const auto &group : groups) {
+      Value grpLhs = (*convMacChain)[group.fromIdx]->lhs;
+      Value grpRhs = (*convMacChain)[group.fromIdx]->rhs;
+      auto filterVecTy = cast<VectorType>(grpRhs.getType());
+      auto signalVecTy = cast<VectorType>(grpLhs.getType());
+      // Sort out the vector used as filter
+      // If the length of the filter is half that of the signal, concatenate
+      // the filter with itself.
+      if (2 * filterVecTy.getShape()[0] == signalVecTy.getShape()[0])
+        grpRhs =
+            rewriter
+                .create<aievec::ConcatOp>(
+                    loc, signalVecTy, SmallVector<Value>({grpRhs, grpRhs}))
+                .getResult();
+      // If the filter has duplicate elements, pack them.
+      if (group.bcastDist == 2)
+        grpRhs =
+            rewriter
+                .create<aievec::ShuffleOp>(loc, signalVecTy, grpRhs, /*mode=*/0)
+                .getResult();
+      // If the first element of the filter to be used is not 0, shift the
+      // filter to align the first element to the beginning.
+      if (group.bcastShift) {
+        int32_t shiftBytes =
+            group.bcastShift * getElementSizeInBits(filterVecTy) >>
+            (3 + group.bcastDist - 1);
+        auto shiftBytesCst =
+            rewriter
+                .create<arith::ConstantOp>(
+                    loc, rewriter.getI32IntegerAttr(shiftBytes))
+                .getResult();
+        grpRhs = rewriter
+                     .create<aievec::ShiftOp>(grpRhs.getDefiningOp()->getLoc(),
                                              signalVecTy, grpRhs, grpRhs,
                                              shiftBytesCst)
+                     .getResult();
      }
-
-      aievec::UPDOp newBcastOp = bcastUPDOp;
-
-      // Rewrite the upd op with maximum vector lanes
-      if (vType != resType) {
-        newBcastOp = rewriter.create<aievec::UPDOp>(
-            bcastUPDOp->getLoc(), resType, bcastUPDOp.getSource(), indices, 0,
-            0, TypedValue<VectorType>(nullptr));
-      }
-
-      // Since we do not need to use duplicated data like in AIE1, if a
-      // dup-factor exists, we extract the identical data by shuffle op. We use
-      // mode 0 to extract the elements with even indices for i8 type data.
-      Operation *shuffleOp = newBcastOp;
-      if (dupFactor != 1) {
-        shuffleOp = rewriter.create<aievec::ShuffleOp>(
-            newBcastOp.getLoc(), resType, newBcastOp.getResult(), 0);
+      // Sort out the vector used as signal
+      // If the signal to be convolved doesn't start at element 0, shift the
+      // signal to align the first element to the beginning.
+      if (group.signalShift) {
+        int32_t shiftBytes =
+            group.signalShift * getElementSizeInBits(signalVecTy) >> 3;
+        auto shiftBytesCst =
+            rewriter
+                .create<arith::ConstantOp>(
+                    loc, rewriter.getI32IntegerAttr(shiftBytes))
+                .getResult();
+        grpLhs = rewriter
+                     .create<aievec::ShiftOp>(loc, signalVecTy, grpLhs, grpLhs,
                                              shiftBytesCst)
+                     .getResult();
      }
-
-      int32_t shiftBytes = (bcastOp.getIdx() + val) *
-                           getElementSizeInBits(vType) / 8 / dupFactor;
-
-      rhs = shuffleOp->getResult(0);
-
-      // Generate a shift_bytes operation for rhs if the start position is not
-      // 0.
-      if (shiftBytes) {
-        arith::ConstantOp constOp = rewriter.create<arith::ConstantOp>(
-            shuffleOp->getLoc(), rewriter.getI32IntegerAttr(shiftBytes));
-        rhs = rewriter.create<aievec::ShiftOp>(
-            shuffleOp->getLoc(),
-            cast<VectorType>(shuffleOp->getResult(0).getType()),
-            shuffleOp->getResult(0), shuffleOp->getResult(0),
-            constOp.getResult());
-      }
-
-      aievec::UPDOp lUPDOp = cast<aievec::UPDOp>(lhs.getDefiningOp());
-      SmallVector<Value, 8> lIndices;
-      lIndices.append(lUPDOp.getIndices().begin(), lUPDOp.getIndices().end());
-
-      lhs = rewriter.create<aievec::UPDOp>(lUPDOp->getLoc(), resType,
-                                           lUPDOp.getSource(), lIndices, 0, 0,
-                                           TypedValue<VectorType>(nullptr));
-
-      if (!hasMulConv && acc.getType() != opType) {
-        auto upsOp = rewriter.create<aievec::UPSOp>(
-            acc.getDefiningOp()->getLoc(), opType, acc, shiftParam);
-        acc = upsOp->getResult(0);
-      }
-
-      Operation *convOp = nullptr;
-      if (fusedOps == groupFusedOps.back()) {
-        if (hasMulConv) {
-          convOp = rewriter.create<aievec::MulConvOp>(srcOp->getLoc(), opType,
-                                                      lhs, rhs, M, N);
-          hasMulConv = false;
-        } else {
-          convOp = rewriter.create<aievec::FMAConvOp>(
-              srcOp->getLoc(), opType, lhs, rhs, acc, M, N, false);
-        }
-        rewriter.replaceOpWithNewOp<aievec::SRSOp>(
-            srcOp, vType, convOp->getResult(0), shiftParam);
-        return success();
-      } else {
-        if (hasMulConv) {
-          convOp = rewriter.create<aievec::MulConvOp>(srcOp->getLoc(), opType,
                                                      lhs, rhs, M, N);
-          hasMulConv = false;
-        } else {
-          convOp = rewriter.create<aievec::FMAConvOp>(
-              srcOp->getLoc(), opType, lhs, rhs, acc, M, N, false);
-        }
-      }
-      acc = convOp->getResult(0);
+      // Generate a convolution operation for the group
+      // If there is no upchain accumulator, use a mul_conv; use a mac_conv
+      // otherwise.
+      if (!grpAcc)
+        grpAcc = rewriter
+                     .create<aievec::MulConvOp>(srcOp.getLoc(), accVecTy,
                                                grpLhs, grpRhs, M, N)
+                     .getResult();
+      else
+        grpAcc =
+            rewriter
+                .create<aievec::FMAConvOp>(srcOp.getLoc(), accVecTy, grpLhs,
                                           grpRhs, grpAcc, M, N, false)
+                .getResult();
+    }
-
-    llvm_unreachable("the conversion should end with srs op.");
+    rewriter.replaceOpWithNewOp<aievec::SRSOp>(srcOp, vecTy, grpAcc,
+                                               shiftParam);
+    return success();
  }
 
   AnalysisManager &am;
@@ -553,11 +464,12 @@
 void configureAIEVecConvOpTransformationLegalizations(ConversionTarget &target,
                                                       AnalysisManager &am) {
-  target.addLegalDialect();
+  LongestConvMACChainAnalysis::am = &am;
+  target.addLegalDialect();
   target.addLegalDialect();
   target.addDynamicallyLegalOp<arith::AddIOp>([&am](arith::AddIOp op) {
-    return !am.getChildAnalysis<canFoldMulAddChainToConvOpAnalysis>(op)
-                .canFoldMulAddChainToConvOp;
+    auto &convAnalysis = am.getChildAnalysis<LongestConvMACChainAnalysis>(op);
+    return !convAnalysis.canChainBeReplacedWithConvOps();
  });
}
 
@@ -567,3 +479,80 @@ void populateAIEVecConvOpTransformationPatterns(RewritePatternSet &patterns,
   patterns.add<FoldMulAddChainToConvOpPattern>(patterns.getContext(), am,
                                                shiftParam);
}
+
+struct AIEVecConvAnalysis : public AIEVecConvAnalysisBase<AIEVecConvAnalysis> {
+  AIEVecConvAnalysis() = default;
+  using ConvMacChain = LongestConvMACChainAnalysis::ConvMacChain;
+  using ConvMacChainGroupList =
+      LongestConvMACChainAnalysis::ConvMacChainGroupList;
+
+  void runOnOperation() override {
+    markAllAnalysesPreserved();
+    AnalysisManager am = getAnalysisManager();
+    LongestConvMACChainAnalysis::am = &am;
+    func::FuncOp func = getOperation();
+
+    // Compute all the chains
+    func.walk([&](arith::AddIOp addOp) {
+      if (isa<VectorType>(addOp.getResult().getType()))
+        am.getChildAnalysis<LongestConvMACChainAnalysis>(addOp);
+    });
+
+    // Sort the chains, ready to split by group
+    func.walk([&](arith::AddIOp addOp) {
+      if (isa<VectorType>(addOp.getResult().getType())) {
+        auto &analysis =
+            am.getChildAnalysis<LongestConvMACChainAnalysis>(addOp);
+        if (analysis.convMacChain)
+          analysis.sortChain();
+      }
+    });
+
+    if (printResult) {
+      func.walk([&](arith::AddIOp addOp) {
+        if (isa<VectorType>(addOp.getResult().getType())) {
+          auto &macChainAnalysis =
+              am.getChildAnalysis<LongestConvMACChainAnalysis>(addOp);
+          if (macChainAnalysis.canChainBeReplacedWithConvOps()) {
+            addOp.print(llvm::outs());
+            llvm::outs() << " is at the end of a convolution MAC Chain:\n";
+            listChain(macChainAnalysis.convMacChain,
+                      macChainAnalysis.getGroupsInChain());
+          }
+        }
+      });
+    }
+  }
+
+  void listChain(const std::unique_ptr<ConvMacChain> &chain,
+                 const ConvMacChainGroupList &groups) const {
+    uint64_t gIdx = 0;
+    for (const auto &group : groups) {
+      llvm::outs() << "-------------- GROUP " << std::to_string(gIdx)
+                   << " --------------\n";
+      llvm::outs() << " Signal Shift: " << std::to_string(group.signalShift)
+                   << "  Kernel Shift: " << std::to_string(group.bcastShift)
+                   << "  Kernel Duplication: "
+                   << std::to_string(group.bcastDist) << "\n";
+      for (uint64_t i = group.fromIdx; i < group.toIdx; i++) {
+        auto shift = (*chain)[i]->shift;
+        auto bcastIdx = (*chain)[i]->bcastIdx;
+        auto lhsOp = (*chain)[i]->lhs.getDefiningOp();
+        auto rhsOp = (*chain)[i]->rhs.getDefiningOp();
+        if (!(*chain)[i]->acc)
+          llvm::outs() << " [mul_conv]\n";
+        llvm::outs() << "   [Shift: " << std::to_string(shift) << "]: ";
+        lhsOp->print(llvm::outs());
+        llvm::outs() << "\n   [Bcast: " << std::to_string(bcastIdx) << "]: ";
+        rhsOp->print(llvm::outs());
+        llvm::outs() << "\n";
+      }
+      gIdx++;
+    }
+    llvm::outs() << "-------------------------------------\n";
+  }
+};
+
+std::unique_ptr<mlir::Pass> xilinx::aievec::createAIEVecConvolutionAnalysisPass() {
+  return std::make_unique<AIEVecConvAnalysis>();
+}
diff --git a/lib/Dialect/AIEVec/Transforms/VectorToAIEVecConversions.cpp b/lib/Dialect/AIEVec/Transforms/VectorToAIEVecConversions.cpp
index 0f79414a9e..f3a6c11d88 100644
--- a/lib/Dialect/AIEVec/Transforms/VectorToAIEVecConversions.cpp
+++ b/lib/Dialect/AIEVec/Transforms/VectorToAIEVecConversions.cpp
@@ -1,4 +1,5 @@
 #include <algorithm>
+#include <bitset>
 #include <cassert>
 #include <tuple>
@@ -16,6 +17,7 @@
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "mlir/Transforms/Passes.h"
 #include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/TypeSwitch.h"
 
 #include "VectorToAIEVecConversions.h"
 
@@ -31,6 +33,50 @@ using namespace xilinx::aievec;
 // Utility functions
 //===----------------------------------------------------------------------===//
 
+// Return the offset of a given transfer read operation with regards to the
+// specified vector type. If the read is aligned to the specified alignment
+// parameter (in bits), then the offset is 0. Otherwise, the offset is the
+// number of elements past the immediately preceding aligned vector length.
+template <
+    typename TransferReadLikeOp,
+    typename = std::enable_if_t<
+        std::is_same_v<TransferReadLikeOp, vector::TransferReadOp> ||
+        std::is_same_v<TransferReadLikeOp, vector::TransferReadOp::Adaptor>>>
+static int64_t getTransferReadAlignmentOffset(TransferReadLikeOp readOp,
+                                              VectorType vType,
+                                              int64_t alignment) {
+  // TODO: Add support for cases where the index is not coming from an
+  // TODO: `affine.apply` op or when the affine map has more than one
+  // TODO: dimension. We also need to address the case where the index is an
+  // TODO: induction variable.
+  auto innerMostIndex = readOp.getIndices().back();
+  auto vectorLength = vType.getShape().back();
+  auto idxDefOp = innerMostIndex.getDefiningOp();
+  if (!idxDefOp)
+    return 0L;
+  int64_t vectorLengthAlignmentOffset =
+      TypeSwitch<Operation *, int64_t>(idxDefOp)
+          .Case<arith::ConstantOp>([&](auto constantOp) {
+            return cast<IntegerAttr>(constantOp.getValue()).getInt() %
+                   vectorLength;
+          })
+          .template Case<AffineApplyOp>([&](auto applyOp) {
+            if (applyOp.getAffineMap().getNumDims() == 1)
+              return applyOp.getAffineMap().compose(ArrayRef<int64_t>{0})[0] %
                     vectorLength;
+            return 0L;
+          })
+          .Default([&](auto) {
+            // XXX: If we can't determine the offset, we assume the access is
+            // XXX: aligned.
+            return 0L;
+          });
+  int64_t absoluteAlignmentOffset = alignment / getElementSizeInBits(vType);
+  if (vectorLengthAlignmentOffset % absoluteAlignmentOffset)
+    return vectorLengthAlignmentOffset;
+  return 0;
+}
 
 // Given the LHS and RHS of an `arith::AddIOp`, if one of them is defined by an
 // `arith::MulIOp`, return a tuple with the `lhs`, `rhs`, and `acc` of the MAC
 // operation that can replace them.
@@ -87,6 +133,73 @@ static aievec::MulElemOp createMulElemAieML(ConversionPatternRewriter &rewriter,
   return mulElemOp;
}
 
+// Return the list of attributes that configure an `aievec.select` op to
+// perform a rotation of the input vector by `rotation` number of elements.
+// The attribute values depend on the vector type of the select operation.
+static SmallVector<NamedAttribute>
+buildAttributeListForRotationSelectOp(PatternRewriter &rewriter, VectorType vTy,
+                                      int64_t rotation) {
+  unsigned width = 0;
+  auto elemTy = vTy.getElementType();
+  auto intTy = dyn_cast<IntegerType>(elemTy);
+  if (intTy)
+    width = intTy.getWidth();
+  StringAttr attr0 = rewriter.getStringAttr("0");
+  StringAttr attr0x06040200 = rewriter.getStringAttr("0x06040200");
+  StringAttr attr0x0e0c0a08 = rewriter.getStringAttr("0x0e0c0a08");
+  StringAttr attr0x2103 = rewriter.getStringAttr("0x2103");
+  StringAttr attr0x3210 = rewriter.getStringAttr("0x3210");
+  StringAttr selectAttrName = rewriter.getStringAttr("select");
+  StringAttr xoffsetsAttrName = rewriter.getStringAttr("xoffsets");
+  StringAttr xoffsetsHiAttrName = rewriter.getStringAttr("xoffsets_hi");
+  StringAttr xsquareAttrName = rewriter.getStringAttr("xsquare");
+  StringAttr xstartAttrName = rewriter.getStringAttr("xstart");
+  StringAttr yoffsetsAttrName = rewriter.getStringAttr("yoffsets");
+  StringAttr yoffsetsHiAttrName = rewriter.getStringAttr("yoffsets_hi");
+  StringAttr ysquareAttrName = rewriter.getStringAttr("ysquare");
+  StringAttr ystartAttrName = rewriter.getStringAttr("ystart");
+
+  switch (width) {
+  case 16:
+    if (rotation % 2) {
+      int64_t xstart = rotation + 1;
+      int64_t ystart = rotation - 1;
+      return SmallVector<NamedAttribute>(
+          {{selectAttrName, rewriter.getStringAttr("0x11111111")},
+           {xoffsetsAttrName, attr0x06040200},
+           {xoffsetsHiAttrName, attr0x0e0c0a08},
+           {xsquareAttrName, attr0x2103},
+           {xstartAttrName, rewriter.getStringAttr(std::to_string(xstart))},
+           {yoffsetsAttrName, rewriter.getStringAttr("0x0503010f")},
+           {yoffsetsHiAttrName, rewriter.getStringAttr("0x0d0b0907")},
+           {ysquareAttrName, attr0x2103},
+           {ystartAttrName, rewriter.getStringAttr(std::to_string(ystart))}});
+    } else {
+      return SmallVector<NamedAttribute>(
+          {{selectAttrName, attr0},
+           {xoffsetsAttrName, attr0x06040200},
+           {xoffsetsHiAttrName, attr0x0e0c0a08},
+           {xsquareAttrName, attr0x3210},
+           {xstartAttrName, rewriter.getStringAttr(std::to_string(rotation))},
+           {yoffsetsAttrName, attr0},
+           {yoffsetsHiAttrName, attr0},
+           {ysquareAttrName, attr0},
+           {ystartAttrName, attr0}});
+    }
+    break;
+  case 32:
+    return SmallVector<NamedAttribute>(
+        {{selectAttrName, attr0},
+         {xoffsetsAttrName, rewriter.getStringAttr("0x76543210")},
+         {xsquareAttrName, attr0x3210},
+         {xstartAttrName, rewriter.getStringAttr(std::to_string(rotation))},
+         {yoffsetsAttrName, attr0},
+         {ysquareAttrName, attr0},
+         {ystartAttrName, attr0}});
  }
+  return {};
+}
+
 namespace xilinx {
 namespace aievec {
 
@@ -301,8 +414,9 @@ struct UPDOpEffectiveAccessSizeAnalysis {
};
 
 //===----------------------------------------------------------------------===//
-// Lowering patterns
+// Rewrite patterns
 //===----------------------------------------------------------------------===//
+
 // This pattern folds `vector.extract` and `vector.broadcast` into
 // `aievec.broadcast` for aie-ml
 struct FoldVectorExtractAndBroadcastToAIEBroadcast
@@ -314,17 +428,28 @@ struct FoldVectorExtractAndBroadcastToAIEBroadcast
                   ConversionPatternRewriter &rewriter) const override {
 
     auto extOp =
-        dyn_cast<vector::ExtractOp>(bcastOp.getSource().getDefiningOp());
+        dyn_cast<vector::ExtractOp>(adaptor.getSource().getDefiningOp());
     if (!extOp)
       return failure();
 
     auto src = extOp.getVector();
     auto pos = extOp.getPosition();
-    VectorType resultType = bcastOp.getResult().getType().cast<VectorType>();
-
-    rewriter.replaceOpWithNewOp<aievec::BroadcastOp>(
-        bcastOp, resultType, src, cast<IntegerAttr>(pos[0]).getInt());
+    int64_t posVal = cast<IntegerAttr>(pos[0]).getInt();
+    VectorType srcVecType = cast<VectorType>(src.getType());
+    VectorType resultType = cast<VectorType>(bcastOp.getResult().getType());
+    if (srcVecType != resultType) {
+      if (srcVecType.getNumElements() != 2 * resultType.getNumElements())
+        return failure();
+      int8_t half = static_cast<int8_t>(posVal / resultType.getNumElements());
+      posVal -= half * resultType.getNumElements();
+      src = rewriter
+                .create<aievec::ExtOp>(extOp.getLoc(), resultType, src,
                                       rewriter.getI8IntegerAttr(half))
+                .getResult();
+    }
+    rewriter.replaceOpWithNewOp<aievec::BroadcastOp>(bcastOp, resultType, src,
                                                     posVal);
 
     return success();
  }
@@ -659,14 +784,15 @@ struct LowerVectorTransferReadToAIEUPD
   using OpConversionPattern::OpConversionPattern;
   LowerVectorTransferReadToAIEUPD(MLIRContext *context, AnalysisManager &am,
-                                  int32_t maxVectorSize = 256)
+                                  int64_t minVectorSize, int64_t maxVectorSize,
+                                  int64_t alignment, int64_t maxLoadSize)
       : OpConversionPattern(context), am(am),
-        maxVectorSize(maxVectorSize) {}
+        minVectorSize(minVectorSize), maxVectorSize(maxVectorSize),
+        vectorAlignment(alignment), maxLoadSize(maxLoadSize) {}
 
   LogicalResult
   matchAndRewrite(vector::TransferReadOp readOp, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    // == Handle invalid read operations ==
     // Masked loads
     if (readOp.getMask())
       return readOp.emitError() << "AIE doesn't support masked loads.";
@@ -680,75 +806,43 @@ struct LowerVectorTransferReadToAIEUPD
     if (map.isConstant())
       return failure();
 
-    // When a transfer read with a constant innermost index is not aligned, we
-    // get the corresponding aligned load followed by an aievec.shift op.
-    // Example:
-    // Convert -
-    // %0 = vector.transfer_read %arg1[16] : vector<32xi8>
-    // %1 = vector.transfer_read %arg1[34] : vector<32xi8>
-    //
-    // to -
-    //
-    // %0 = aievec.upd %arg1[0] : vector<32xi8>
-    // %1 = aievec.upd %arg1[32] : vector<32xi8>
-    // %2 = aievec.shift %0, %1 {shift = 16 : i32} : vector<32xi8>
-    // %3 = aievec.upd %arg1[64] : vector<32xi8>
-    // %4 = aievec.shift %2, %3 {shift = 2 : i32} : vector<32xi8>
-    //
-    SmallVector<Value, 8> indices(adaptor.getIndices().begin(),
                                  adaptor.getIndices().end());
-    Value innerMostIdx = indices[indices.size() - 1];
-    Value newIdx = innerMostIdx;
-    VectorType vType = readOp.getVector().getType().cast<VectorType>();
-    int32_t lanes = getVectorLaneSize(vType);
-
-    if (auto defOp = innerMostIdx.getDefiningOp()) {
-      if (auto constOp = dyn_cast<arith::ConstantOp>(defOp)) {
-        int64_t val = constOp.getValue().cast<IntegerAttr>().getInt();
-        if (val) {
-          int64_t offset = val % lanes;
-          int64_t idx = val / lanes * lanes;
-          newIdx = rewriter.create<arith::ConstantOp>(
-              constOp.getLoc(),
-              rewriter.getIntegerAttr(constOp.getType(), idx));
-          indices[indices.size() - 1] = newIdx;
-          int32_t shiftBytes = offset * getElementSizeInBits(vType) / 8;
-
-          if (shiftBytes) {
-            auto updOp = rewriter.create<aievec::UPDOp>(
-                readOp.getLoc(), vType, adaptor.getSource(), indices, 0, 0,
-                TypedValue<VectorType>(nullptr));
-            newIdx = rewriter.create<arith::ConstantOp>(
-                constOp.getLoc(),
-                rewriter.getIntegerAttr(constOp.getType(), idx + lanes));
-            indices[indices.size() - 1] = newIdx;
-            // Load the next vector lanes
-            auto nextUpdOp = rewriter.create<aievec::UPDOp>(
-                readOp.getLoc(), vType, adaptor.getSource(), indices, 0, 0,
-                TypedValue<VectorType>(nullptr));
-
-            arith::ConstantOp constOp = rewriter.create<arith::ConstantOp>(
-                readOp.getLoc(), rewriter.getI32IntegerAttr(shiftBytes));
-            rewriter.replaceOpWithNewOp<aievec::ShiftOp>(
-                readOp, vType, updOp->getResult(0), nextUpdOp->getResult(0),
-                constOp.getResult());
-          } else {
-            rewriter.replaceOpWithNewOp<aievec::UPDOp>(
-                readOp, vType, adaptor.getSource(), indices, 0, 0,
-                TypedValue<VectorType>(nullptr));
-          }
-          return success();
-        }
-      }
-    }
-    rewriter.replaceOpWithNewOp<aievec::UPDOp>(
-        readOp, vType, adaptor.getSource(), indices, 0, 0,
+    // Misaligned accesses
+    auto vType = readOp.getVectorType();
+    if (getTransferReadAlignmentOffset(adaptor, vType, vectorAlignment) != 0)
+      return failure();
+
+    // Invalid vector size.
+    // We can handle cases where the vector size is:
+    //   1) the minimum vector size
+    //   2) a square multiple of the alignment size and up to the maximum
+    //      vector size.
+    int64_t vSize = vType.getNumElements() * vType.getElementTypeBitWidth();
+    if (vSize > maxVectorSize ||
+        (vSize % vectorAlignment && vSize != minVectorSize))
+      return failure();
+    // We can deal with linked update instructions when the vector size is
+    // exactly twice the load size. This could change in future architectures.
+    if (vSize > maxLoadSize && vSize != maxLoadSize * 2)
+      return failure();
+    int64_t multiplicity = vSize / vectorAlignment;
+    if ((vSize > minVectorSize) && std::bitset<8>(multiplicity).count() != 1)
+      return failure();
+
+    auto updOp = rewriter.create<aievec::UPDOp>(
+        readOp.getLoc(), vType, adaptor.getSource(), adaptor.getIndices(), 0, 0,
         TypedValue<VectorType>(nullptr));
+    if (vSize > maxLoadSize) {
+      updOp = rewriter.create<aievec::UPDOp>(
+          readOp.getLoc(), vType, adaptor.getSource(), adaptor.getIndices(),
+          maxLoadSize, 1, updOp.getResult());
+    }
+    rewriter.replaceOp(readOp, updOp.getResult());
+
     return success();
  }
 
   AnalysisManager &am;
-  int32_t maxVectorSize;
+  int64_t minVectorSize, maxVectorSize, vectorAlignment, maxLoadSize;
};
 
 // XXX: Notice that this template doesn't verify that the vector element type
@@ -1440,36 +1534,151 @@ struct LowerVectorReductionAddBfloat16Op
  }
};
 
-// If a UPD op is loading a vector twice the size of the architecture
-// vector size, split it into a high and low load into the accumulator.
-// TODO: This is a process we may want to include as part of the
-// TODO: legalization of `vector.transfer_read`.
-struct SplitUPDOpOnAccPattern : public OpConversionPattern<aievec::UPDOp> {
+// Convert a `vector.extract_strided_slice` op on 1D vectors into an
+// `aievec.select` + `aievec.ext` op.
+struct LowerVectorExtractStridedSliceOpAIEv1Pattern
+    : public OpConversionPattern<vector::ExtractStridedSliceOp> {
+  using OpConversionPattern::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(vector::ExtractStridedSliceOp extractOp, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto vType = extractOp.getVectorType();
+    if (vType.getRank() != 1)
+      return failure();
+
+    int64_t stride = cast<IntegerAttr>(adaptor.getStrides()[0]).getInt();
+    if (stride != 1)
+      return failure();
+
+    // AIE doesn't support select operations on i8
+    if (getElementSizeInBits(vType) == 8)
+      return extractOp.emitError()
+             << "AIEv1 doesn't support select ops on int8 types";
+
+    // We only accept the case where we are extracting a slice half the size of
+    // the input vector.
+    int64_t size = cast<IntegerAttr>(adaptor.getSizes()[0]).getInt();
+    if (vType.getNumElements() != 2 * size)
+      return failure();
+
+    int64_t offset = cast<IntegerAttr>(adaptor.getOffsets()[0]).getInt();
+    auto selectOp = rewriter.create<aievec::SelectOp>(
+        extractOp.getLoc(), vType, adaptor.getVector(),
+        buildAttributeListForRotationSelectOp(rewriter, vType, offset));
+    rewriter.replaceOpWithNewOp<aievec::ExtOp>(extractOp, extractOp.getType(),
+                                               selectOp.getResult(),
+                                               rewriter.getI8IntegerAttr(0));
+
+    return success();
+  }
+};
+
+// Convert a `vector.extract_strided_slice` op on 1D vectors into an
+// `aievec.shift` op.
+struct LowerVectorExtractStridedSliceOpAIEMLPattern
+    : public OpConversionPattern<vector::ExtractStridedSliceOp> {
+  using OpConversionPattern::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(vector::ExtractStridedSliceOp extractOp, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto vType = cast<VectorType>(adaptor.getVector().getType());
+    if (vType.getRank() != 1)
+      return failure();
+
+    int64_t stride = cast<IntegerAttr>(adaptor.getStrides()[0]).getInt();
+    if (stride != 1)
+      return failure();
+
+    // We only accept the case where we are extracting a slice half the size of
+    // the input vector.
+    int64_t size = cast<IntegerAttr>(adaptor.getSizes()[0]).getInt();
+    if (vType.getNumElements() != 2 * size)
+      return failure();
+
+    auto shortVecType = cast<VectorType>(extractOp.getResult().getType());
+    auto bottomHalf = rewriter
+                          .create<aievec::ExtOp>(
+                              extractOp.getLoc(), shortVecType,
+                              adaptor.getVector(), rewriter.getI8IntegerAttr(0))
+                          .getResult();
+    auto topHalf = rewriter
+                       .create<aievec::ExtOp>(extractOp.getLoc(), shortVecType,
                                              adaptor.getVector(),
                                              rewriter.getI8IntegerAttr(1))
+                       .getResult();
+    int64_t offset = cast<IntegerAttr>(adaptor.getOffsets()[0]).getInt();
+    int32_t shiftBytes = offset * getElementSizeInBits(vType) / 8;
+    auto shiftBytesConstOp = rewriter.create<arith::ConstantOp>(
+        extractOp.getLoc(), rewriter.getIntegerType(32),
+        rewriter.getI32IntegerAttr(shiftBytes));
+    rewriter.replaceOpWithNewOp<aievec::ShiftOp>(
+        extractOp, shortVecType, bottomHalf, topHalf, shiftBytesConstOp);
+
+    return success();
+  }
+};
+
+// Replaces a short UPD op with a wide one followed by an ext op of the bottom
+// half.
+struct ExpandUPDToUPDAndExtPattern : public OpConversionPattern<aievec::UPDOp> {
   using OpConversionPattern::OpConversionPattern;
 
-  SplitUPDOpOnAccPattern(MLIRContext *context, AnalysisManager &am,
-                         int32_t maxVectorSize = 256)
-      : OpConversionPattern(context), am(am),
-        maxVectorSize(maxVectorSize) {}
+  ExpandUPDToUPDAndExtPattern(MLIRContext *context)
+      : OpConversionPattern(context) {}
 
   LogicalResult
   matchAndRewrite(aievec::UPDOp updOp, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    if (am.getChildAnalysis<UPDOpEffectiveAccessSizeAnalysis>(updOp)
-            .effectiveSize < 2 * static_cast<unsigned>(maxVectorSize))
+    // Verify that we haven't already expanded this one
+    if (updOp->hasOneUse() && isa<aievec::ExtOp>(*updOp->getUsers().begin()))
       return failure();
 
-    auto updOp0 = rewriter.create<aievec::UPDOp>(
-        updOp.getLoc(), updOp.getResult().getType(), adaptor.getSource(),
-        adaptor.getIndices(), 0, 0);
-    rewriter.replaceOpWithNewOp<aievec::UPDOp>(
-        updOp, updOp.getResult().getType(), adaptor.getSource(),
-        adaptor.getIndices(), 2 * maxVectorSize, 1, updOp0.getResult());
+    auto vecType = cast<VectorType>(updOp.getType());
+    SmallVector<int64_t, 4> vecShape(vecType.getShape().begin(),
                                     vecType.getShape().end());
+    vecShape[vecType.getRank() - 1] *= 2;
+    auto longVecType = VectorType::get(vecShape, vecType.getElementType());
+    auto newUpdOp = rewriter.create<aievec::UPDOp>(
+        updOp.getLoc(), longVecType, adaptor.getSource(), adaptor.getIndices(),
+        adaptor.getOffset(), adaptor.getIndex(), adaptor.getVector());
+    rewriter.replaceOpWithNewOp<aievec::ExtOp>(
+        updOp, vecType, newUpdOp.getResult(), rewriter.getI8IntegerAttr(0));
+
     return success();
  }
-
-  AnalysisManager &am;
-  int32_t maxVectorSize;
};
 
+// Replaces a wide UPD op followed by an ext op of the bottom half with a short
+// UPD op.
+struct LowerVectorExtractStridedSliceOpAIEMLPattern
+    : public OpConversionPattern<vector::ExtractStridedSliceOp> {
+  using OpConversionPattern<vector::ExtractStridedSliceOp>::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(vector::ExtractStridedSliceOp extractOp, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto vType = cast<VectorType>(adaptor.getVector().getType());
+    if (vType.getRank() != 1)
+      return failure();
+
+    int64_t stride = cast<IntegerAttr>(adaptor.getStrides()[0]).getInt();
+    if (stride != 1)
+      return failure();
+
+    // We only accept the case where we are extracting a slice half the size
+    // of the input vector.
+    int64_t size = cast<IntegerAttr>(adaptor.getSizes()[0]).getInt();
+    if (vType.getNumElements() != 2 * size)
+      return failure();
+
+    auto shortVecType = cast<VectorType>(extractOp.getResult().getType());
+    auto bottomHalf = rewriter
+                          .create<aievec::ExtOp>(
+                              extractOp.getLoc(), shortVecType,
+                              adaptor.getVector(), rewriter.getI8IntegerAttr(0))
+                          .getResult();
+    auto topHalf = rewriter
+                       .create<aievec::ExtOp>(extractOp.getLoc(), shortVecType,
+                                              adaptor.getVector(),
+                                              rewriter.getI8IntegerAttr(1))
+                       .getResult();
+    int64_t offset = cast<IntegerAttr>(adaptor.getOffsets()[0]).getInt();
+    int32_t shiftBytes = offset * getElementSizeInBits(vType) / 8;
+    auto shiftBytesConstOp = rewriter.create<arith::ConstantOp>(
+        extractOp.getLoc(), rewriter.getIntegerType(32),
+        rewriter.getI32IntegerAttr(shiftBytes));
+    rewriter.replaceOpWithNewOp<aievec::ShiftOp>(
+        extractOp, shortVecType, bottomHalf, topHalf, shiftBytesConstOp);
+
+    return success();
+  }
+};
+
+// Replaces a short UPD op with a wide one followed by an ext op of the bottom
+// half.
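+//
+// E.g. (cf. test-conv-op-i16.mlir in this change; values renamed for
+// illustration):
+//   %0 = aievec.upd %A[%i, %j] {...} : memref<18x288xi16>, vector<16xi16>
+// becomes
+//   %w = aievec.upd %A[%i, %j] {...} : memref<18x288xi16>, vector<32xi16>
+//   %0 = aievec.ext %w {index = 0 : i8} : vector<32xi16>, vector<16xi16>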
+struct ExpandUPDToUPDAndExtPattern
+    : public OpConversionPattern<aievec::UPDOp> {
   using OpConversionPattern::OpConversionPattern;
 
-  SplitUPDOpOnAccPattern(MLIRContext *context, AnalysisManager &am,
-                         int32_t maxVectorSize = 256)
-      : OpConversionPattern<aievec::UPDOp>(context), am(am),
-        maxVectorSize(maxVectorSize) {}
+  ExpandUPDToUPDAndExtPattern(MLIRContext *context)
+      : OpConversionPattern<aievec::UPDOp>(context) {}
 
   LogicalResult
   matchAndRewrite(aievec::UPDOp updOp, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    if (am.getChildAnalysis<UPDOpEffectiveAccessSizeAnalysis>(updOp)
-            .effectiveSize < 2 * static_cast<unsigned>(maxVectorSize))
+    // Verify that we haven't already expanded this one
+    if (updOp->hasOneUse() && isa<aievec::ExtOp>(*updOp->getUsers().begin()))
       return failure();
 
-    auto updOp0 = rewriter.create<aievec::UPDOp>(
-        updOp.getLoc(), updOp.getResult().getType(), adaptor.getSource(),
-        adaptor.getIndices(), 0, 0);
-    rewriter.replaceOpWithNewOp<aievec::UPDOp>(
-        updOp, updOp.getResult().getType(), adaptor.getSource(),
-        adaptor.getIndices(), 2 * maxVectorSize, 1, updOp0.getResult());
+    auto vecType = cast<VectorType>(updOp.getType());
+    SmallVector<int64_t, 4> vecShape(vecType.getShape().begin(),
+                                     vecType.getShape().end());
+    vecShape[vecType.getRank() - 1] *= 2;
+    auto longVecType = VectorType::get(vecShape, vecType.getElementType());
+    auto newUpdOp = rewriter.create<aievec::UPDOp>(
+        updOp.getLoc(), longVecType, adaptor.getSource(), adaptor.getIndices(),
+        adaptor.getOffset(), adaptor.getIndex(), adaptor.getVector());
+    rewriter.replaceOpWithNewOp<aievec::ExtOp>(
+        updOp, vecType, newUpdOp.getResult(), rewriter.getI8IntegerAttr(0));
+
     return success();
   }
-
-  AnalysisManager &am;
-  int32_t maxVectorSize;
 };
 
+// Replaces a wide UPD op followed by an ext op of the bottom half with a short
+// UPD op.
+struct FuseExtIntoUPDPattern : public OpConversionPattern<aievec::ExtOp> {
+  using OpConversionPattern<aievec::ExtOp>::OpConversionPattern;
+
+  FuseExtIntoUPDPattern(MLIRContext *context)
+      : OpConversionPattern<aievec::ExtOp>(context) {}
+
+  LogicalResult
+  matchAndRewrite(aievec::ExtOp extOp, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    // Verify we are extracting the lower half...
+    if (extOp.getIndex() != 0)
+      return failure();
+    // ...of a UPDOp
+    auto updOp = dyn_cast<aievec::UPDOp>(extOp.getSource().getDefiningOp());
+    if (!updOp)
+      return failure();
+
+    // Verify that this is a direct upd -> ext pattern
+    if (!updOp->hasOneUse())
+      return failure();
+
+    rewriter.replaceOpWithNewOp<aievec::UPDOp>(
+        extOp, extOp.getType(), updOp.getSource(), updOp.getIndices(),
+        updOp.getOffset(), updOp.getIndex(), updOp.getVector());
+
+    return success();
+  }
+};
 
//===----------------------------------------------------------------------===//
@@ -1478,19 +1687,19 @@ struct SplitUPDOpOnAccPattern : public OpConversionPattern<aievec::UPDOp> {
 
 static void populateAIEVecV1ConversionPatterns(RewritePatternSet &patterns,
                                                AnalysisManager &am) {
-  patterns.add<LowerVectorTransferReadToAIEUPD, SplitUPDOpOnAccPattern>(
-      patterns.getContext(), am, 256);
+  patterns.add<LowerVectorTransferReadToAIEUPD>(patterns.getContext(), am, 128,
+                                                512, 128, 256);
   patterns
       .add<ConvertMulAddToAIEVecFMAOpPattern,
-           FoldBroadcastToFMAOp, LowerVectorAddIOpToAIEVecAddOp>(
-          patterns.getContext());
+           FoldBroadcastToFMAOp, LowerVectorAddIOpToAIEVecAddOp,
+           LowerVectorExtractStridedSliceOpAIEv1Pattern>(patterns.getContext());
 }
 
 static void populateAIEVecV2ConversionPatterns(RewritePatternSet &patterns,
                                                AnalysisManager &am) {
-  patterns.add<LowerVectorTransferReadToAIEUPD, SplitUPDOpOnAccPattern>(
-      patterns.getContext(), am, 512);
+  patterns.add<LowerVectorTransferReadToAIEUPD>(patterns.getContext(), am, 128,
+                                                1024, 256, 1024);
   patterns.add<
       LowerVectorAddIOpToAIEVecAddElemOp, LowerVectorAddFOpToAIEVecAddElemOp,
@@ -1503,8 +1712,8 @@ static void populateAIEVecV2ConversionPatterns(RewritePatternSet &patterns,
       LowerVectorReductionAddFloatOp, LowerVectorReductionAddBfloat16Op,
       FoldVectorExtractAndBroadcastToAIEBroadcast,
       ConvertMulAddToAIEVecFMAElemOpPattern,
-      ConvertMulIToAIEVecMulElemOpPattern, ConvertMulFToAIEVecMulElemOpPattern>(
-      patterns.getContext());
+      ConvertMulIToAIEVecMulElemOpPattern, ConvertMulFToAIEVecMulElemOpPattern,
+      LowerVectorExtractStridedSliceOpAIEMLPattern>(patterns.getContext());
 }
 
 //===----------------------------------------------------------------------===//
@@ -1512,11 +1721,11 @@ static void populateAIEVecV2ConversionPatterns(RewritePatternSet &patterns,
 //===----------------------------------------------------------------------===//
 
 // TODO: Review the validity of these legalizations beyond basic cases.
-
 static void configureAIEVecCommonLegalizations(ConversionTarget &target,
                                                AnalysisManager &am) {
   target.addLegalDialect<xilinx::aievec::AIEVecDialect>();
   target.addIllegalOp<vector::TransferReadOp>();
+  target.addIllegalOp<vector::ExtractStridedSliceOp>();
   target.addDynamicallyLegalOp<arith::AddIOp>(
      [](arith::AddIOp op) { return !isa<VectorType>(op.getType()); });
   target.addDynamicallyLegalOp<arith::AddFOp>(
@@ -1529,10 +1738,6 @@ static void configureAIEVecCommonLegalizations(ConversionTarget &target,
 
 static void configureAIEVecV1Legalizations(ConversionTarget &target,
                                            AnalysisManager &am) {
-  target.addDynamicallyLegalOp<xilinx::aievec::UPDOp>(
-      [&am](xilinx::aievec::UPDOp op) {
-        return am.getChildAnalysis<UPDOpEffectiveAccessSizeAnalysis>(op)
-                   .effectiveSize <= 512;
-      });
   target.addDynamicallyLegalOp<xilinx::aievec::FMAOp>(
      [](xilinx::aievec::FMAOp op) {
        auto lhsDefOp = op.getLhs().getDefiningOp();
        aievec::ConcatOp concatOp = nullptr;
@@ -1563,10 +1768,6 @@ static void configureAIEVecV2Legalizations(ConversionTarget &target,
                                            AnalysisManager &am) {
   target.addLegalOp<UnrealizedConversionCastOp>();
-  target.addDynamicallyLegalOp<aievec::UPDOp>([&am](aievec::UPDOp op) {
-    return am.getChildAnalysis<UPDOpEffectiveAccessSizeAnalysis>(op)
-               .effectiveSize <= 1024;
-  });
 
   // A set recording the vector lane size and element width supported
   llvm::SmallSet<std::pair<unsigned, unsigned>, 16> laneSizeElWidthPairSet;
@@ -1862,6 +2063,60 @@ createLowerVectorToAIEVec(const LowerVectorToAIEVecOptions &options) {
   return std::make_unique<LowerVectorToAIEVec>(options);
 }
 
+//===---------------------------------------------------------------------
+// Custom canonicalization passes
+//===---------------------------------------------------------------------
+
+// This pass widens UPD ops to twice their width, followed by an ext op of the
+// bottom half. This can be used together with SimplifyUPDOpsPass to find
+// additional common subexpressions with UPDs generated from unaligned
+// `transfer_read` ops.
+struct ExtendUPDOpsPass
+    : public PassWrapper<ExtendUPDOpsPass, OperationPass<func::FuncOp>> {
+  void runOnOperation() override {
+    MLIRContext *context = &getContext();
+    RewritePatternSet patterns(context);
+    ConversionTarget target(*context);
+    patterns.add<ExpandUPDToUPDAndExtPattern>(patterns.getContext());
+    target.addLegalDialect<xilinx::aievec::AIEVecDialect>();
+    target.addDynamicallyLegalOp<aievec::UPDOp>([](aievec::UPDOp op) {
+      return op.getVector() ||
+             (op->hasOneUse() &&
+              isa<aievec::ExtOp>(*op->getUsers().begin())) ||
+             llvm::all_of(op->getUsers(), [](Operation *op) {
+               return isa<aievec::UPDOp>(op);
+             });
+    });
+    auto func = getOperation();
+    if (failed(applyPartialConversion(func, target, std::move(patterns)))) {
+      signalPassFailure();
+    }
+  }
+};
+
+// This pass replaces wide UPD ops that are only used by a single ext op of
+// the bottom half with a short UPD op. It undoes the work of
+// ExtendUPDOpsPass.
+// TODO: This pass can be extended to work with wide UPD ops that are used by
+// TODO: a single ext op of the top half, which might be a good opportunity to
+// TODO: further optimize wide UPDs.
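+//
+// E.g., if after CSE a widened load is only read through its bottom half,
+//   %w = aievec.upd %A[%i] {...} : memref<256xi32>, vector<16xi32>
+//   %v = aievec.ext %w {index = 0 : i8} : vector<16xi32>, vector<8xi32>
+// collapses back to
+//   %v = aievec.upd %A[%i] {...} : memref<256xi32>, vector<8xi32>
+// (a sketch of FuseExtIntoUPDPattern above; values renamed for illustration).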
+struct SimplifyUPDOpsPass
+    : public PassWrapper<SimplifyUPDOpsPass, OperationPass<func::FuncOp>> {
+  void runOnOperation() override {
+    MLIRContext *context = &getContext();
+    RewritePatternSet patterns(context);
+    ConversionTarget target(*context);
+    patterns.add<FuseExtIntoUPDPattern>(patterns.getContext());
+    target.addLegalDialect<xilinx::aievec::AIEVecDialect>();
+    target.addDynamicallyLegalOp<aievec::ExtOp>([](aievec::ExtOp op) {
+      auto defOp = op.getSource().getDefiningOp();
+      return !defOp || !isa<aievec::UPDOp>(defOp) || !defOp->hasOneUse() ||
+             op.getIndex() != 0;
+    });
+    auto func = getOperation();
+    if (failed(applyPartialConversion(func, target, std::move(patterns)))) {
+      signalPassFailure();
+    }
+  }
+};
+
 //============================================================================//
 //=============== Main Vector2AIEVec Pipeline Configuration ==================//
 //============================================================================//
@@ -1870,6 +2125,11 @@ void xilinx::aievec::buildLowerVectorToAIEVec(
     OpPassManager &pm, const LowerVectorToAIEVecOptions &options) {
   // Add lowering from `Vector` to `AIEVec`
   pm.addPass(createLowerVectorToAIEVec(options));
+  pm.addPass(createCanonicalizerPass());
+
+  // Widen UPD ops, let CSE merge the resulting duplicate wide loads, and
+  // then narrow any widened loads that CSE did not pair up.
+  pm.addPass(std::make_unique<ExtendUPDOpsPass>());
   pm.addPass(createCSEPass());
+  pm.addPass(std::make_unique<SimplifyUPDOpsPass>());
   pm.addPass(createCanonicalizerPass());
 }
diff --git a/lib/Dialect/AIEVec/Transforms/VectorToVectorConversions.cpp b/lib/Dialect/AIEVec/Transforms/VectorToVectorConversions.cpp
index b460d0c30e..ce032fc08f 100644
--- a/lib/Dialect/AIEVec/Transforms/VectorToVectorConversions.cpp
+++ b/lib/Dialect/AIEVec/Transforms/VectorToVectorConversions.cpp
@@ -14,6 +14,7 @@
 #include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "mlir/Transforms/Passes.h"
+#include "llvm/ADT/TypeSwitch.h"
 
 #include "VectorToVectorConversions.h"
 
@@ -30,25 +31,46 @@ using namespace xilinx::aievec;
 //============================================================================//
 
 // Return the offset of a given transfer read operation with regard to the
-// specified vector type. If the read is aligned size of the vector type, then
-// the offset is 0. Otherwise, the offset is the number of elements past the
-// immediately preceding aligned address.
+// specified vector type. If the read is aligned to the specified alignment
+// parameter (in bits), then the offset is 0. Otherwise, the offset is the
+// number of elements past the immediately preceding aligned vector length.
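+//
+// E.g., with a 128-bit alignment, a read of vector<8xi32> (32-bit elements)
+// must start at a multiple of 128 / 32 = 4 elements: a read at index 4
+// returns offset 0, while a read at index 3 returns offset 3 (cf. the
+// unaligned-load.mlir tests in this change, where that read is realized as
+// an aligned load shifted by 3 * 4 = 12 bytes).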
 template <
     typename TransferReadLikeOp,
     typename = std::enable_if_t<
        std::is_same_v<TransferReadLikeOp, vector::TransferReadOp> ||
        std::is_same_v<TransferReadLikeOp, vector::TransferReadOp::Adaptor>>>
-static unsigned getTransferReadAlignmentOffset(TransferReadLikeOp op,
-                                               VectorType vType) {
+static int64_t getTransferReadAlignmentOffset(TransferReadLikeOp readOp,
+                                              VectorType vType,
+                                              int64_t alignment) {
   // TODO: Add support for cases where the index is not coming from an
-  // TODO: `affine.apply` op. E.g.: when the index is a constant.
-  auto innerMostIndex = op.getIndices().back();
+  // TODO: `affine.apply` op or when the affine map has more than one
+  // TODO: dimension. We also need to address the case where the index is an
+  // TODO: induction variable.
+  auto innerMostIndex = readOp.getIndices().back();
   auto vectorLength = vType.getShape().back();
-  if (auto defOp = innerMostIndex.getDefiningOp())
-    if (auto applyOp = dyn_cast<AffineApplyOp>(defOp))
-      if (applyOp.getAffineMap().getNumDims() == 1)
-        return applyOp.getAffineMap().compose(ArrayRef<int64_t>{0})[0] %
-               vectorLength;
+  auto idxDefOp = innerMostIndex.getDefiningOp();
+  if (!idxDefOp)
+    return 0L;
+  int64_t vectorLengthAlignmentOffset =
+      TypeSwitch<Operation *, int64_t>(idxDefOp)
+          .Case<arith::ConstantOp>([&](auto constantOp) {
+            return cast<IntegerAttr>(constantOp.getValue()).getInt() %
+                   vectorLength;
+          })
+          .template Case<AffineApplyOp>([&](auto applyOp) {
+            if (applyOp.getAffineMap().getNumDims() == 1)
+              return applyOp.getAffineMap().compose(ArrayRef<int64_t>{0})[0] %
+                     vectorLength;
+            return 0L;
+          })
+          .Default([&](auto) {
+            // XXX: If we can't determine the offset, we assume the access is
+            // XXX: aligned.
+            return 0L;
+          });
+  int64_t absoluteAlignmentOffset = alignment / getElementSizeInBits(vType);
+  if (vectorLengthAlignmentOffset % absoluteAlignmentOffset)
+    return vectorLengthAlignmentOffset;
   return 0;
 }
 
@@ -64,6 +86,12 @@ struct SplitUnalignedTransferReadPattern
     : public OpConversionPattern<vector::TransferReadOp> {
   using OpConversionPattern::OpConversionPattern;
 
+  SplitUnalignedTransferReadPattern(MLIRContext *context,
+                                    int64_t minVectorSize,
+                                    int64_t maxVectorSize, int64_t alignment)
+      : OpConversionPattern<vector::TransferReadOp>(context),
+        minVectorSize(minVectorSize), maxVectorSize(maxVectorSize),
+        vectorAlignment(alignment) {}
+
   LogicalResult
   matchAndRewrite(vector::TransferReadOp readOp, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
@@ -73,29 +101,47 @@ struct SplitUnalignedTransferReadPattern
 
     // Check if the transfer is unaligned.
     auto vType = readOp.getVectorType();
-    unsigned offset = getTransferReadAlignmentOffset(adaptor, vType);
+    int64_t offset =
+        getTransferReadAlignmentOffset(adaptor, vType, vectorAlignment);
     if (offset == 0)
       return failure();
-    // Create an aligned transfer read
+    // Verify that we can load a vector 2x as long as the original.
+    auto vLen = vType.getShape().back();
+    auto longVecTy = VectorType::get(2 * vLen, vType.getElementType());
+    auto longVecSize = getElementSizeInBits(vType) * 2 * vLen;
+    if (longVecSize > maxVectorSize)
+      return failure();
 
     // Calculate the aligned indices for the lower and higher parts.
     // TODO: Add support for cases where the offset is greater than the
     // TODO: vector length.
-    auto lowIdx =
-        dyn_cast<AffineApplyOp>(adaptor.getIndices().back().getDefiningOp())
-            .getMapOperands()[0];
-    auto vLen = vType.getShape().back();
-    auto longVecTy = VectorType::get(2 * vLen, vType.getElementType());
+    auto loc = readOp.getLoc();
+    auto newInnerMostIdx =
+        TypeSwitch<Operation *, Value>(
+            adaptor.getIndices().back().getDefiningOp())
+            .Case<AffineApplyOp>(
+                [&](auto applyOp) { return applyOp.getMapOperands()[0]; })
+            .Case<arith::ConstantOp>([&](auto constantOp) {
+              auto cstValue = cast<IntegerAttr>(constantOp.getValue()).getInt();
+              auto newCstValue = cstValue - offset;
+              auto newConstantIdxOp = rewriter.create<arith::ConstantOp>(
+                  loc,
+                  rewriter.getIntegerAttr(constantOp.getType(), newCstValue));
+              return newConstantIdxOp.getResult();
+            })
+            .Default([&](auto) {
+              llvm_unreachable("Unexpected index type");
+              return nullptr;
+            });
     SmallVector<Value, 8> alignedIdx;
     alignedIdx.append(adaptor.getIndices().begin(), adaptor.getIndices().end());
-    alignedIdx[alignedIdx.size() - 1] = lowIdx;
+    alignedIdx[alignedIdx.size() - 1] = newInnerMostIdx;
 
     // Create the aligned transfer read for a vector 2x as long that covers the
     // elements of the unaligned vector.
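+    // E.g., an unaligned read of vector<8xi32> at index 3 becomes a read of
+    // vector<16xi32> at index 0, and the original vector is then recovered
+    // with a vector.extract_strided_slice {offsets = [3], sizes = [8]} that
+    // the target-specific patterns later lower to aievec.select/aievec.shift.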
    auto newReadOp = rewriter.create<vector::TransferReadOp>(
-        readOp.getLoc(), longVecTy, adaptor.getSource(), alignedIdx,
-        adaptor.getPadding());
+        loc, longVecTy, adaptor.getSource(), alignedIdx, adaptor.getPadding());
 
     // Create a `vector.extract_strided_slice` to extract the unaligned vector.
     rewriter.replaceOpWithNewOp<vector::ExtractStridedSliceOp>(
@@ -103,6 +149,10 @@ struct SplitUnalignedTransferReadPattern
 
     return success();
   }
+
+  int64_t minVectorSize;
+  int64_t maxVectorSize;
+  int64_t vectorAlignment;
 };
 
 // This pattern converts a `vector.transfer_read` with a splat permutation map
@@ -113,6 +163,9 @@ struct ConvertSplatTransferReadToBroadcastPattern
     : public OpConversionPattern<vector::TransferReadOp> {
   using OpConversionPattern::OpConversionPattern;
 
+  ConvertSplatTransferReadToBroadcastPattern(MLIRContext *context)
+      : OpConversionPattern<vector::TransferReadOp>(context) {}
+
   LogicalResult
   matchAndRewrite(vector::TransferReadOp readOp, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
@@ -163,16 +216,12 @@ struct ConvertSplatTransferReadToBroadcastPattern
 //============================================================================//
 //================ Common AIE canonicalization configuration =================//
 //============================================================================//
-
 static void
 configureCommonAIECanonicalizeLegalizations(ConversionTarget &target) {
-  target.addLegalDialect<memref::MemRefDialect, AffineDialect>();
+  target.addLegalDialect<memref::MemRefDialect>();
   target.addLegalDialect<AffineDialect>();
   target.addLegalDialect<arith::ArithDialect>();
-  target.addDynamicallyLegalOp<vector::TransferReadOp>(
-      [](vector::TransferReadOp op) {
-        return !op.getPermutationMap().isConstant();
-      });
+  target.addLegalDialect<vector::VectorDialect>();
 }
 
 static void
@@ -186,27 +235,36 @@ populateCommonAIECanonicalizeConversionPatterns(RewritePatternSet &patterns) {
 
 //============================================================================//
 //=============== AIEv1-specific canonicalization configuration ==============//
 //============================================================================//
 
 static void configureAIEv1CanonicalizeLegalizations(ConversionTarget &target) {
-  target.addLegalDialect<vector::VectorDialect>();
   target.addDynamicallyLegalOp<vector::TransferReadOp>(
      [](vector::TransferReadOp op) {
        return !op.getPermutationMap().isConstant() &&
-               getTransferReadAlignmentOffset(op, op.getVectorType()) == 0;
+               getTransferReadAlignmentOffset(op, op.getVectorType(), 128) == 0;
      });
 }
 
 static void
 populateAIEv1CanonicalizeConversionPatterns(RewritePatternSet &patterns) {
-  patterns.add<ConvertSplatTransferReadToBroadcastPattern>(patterns.getContext());
+  patterns.add<SplitUnalignedTransferReadPattern>(patterns.getContext(), 128,
+                                                  512, 128);
 }
 
 //============================================================================//
 //============== AIEML-specific canonicalization configuration ===============//
 //============================================================================//
 
-static void configureAIEMLCanonicalizeLegalizations(ConversionTarget &target) {}
+static void configureAIEMLCanonicalizeLegalizations(ConversionTarget &target) {
+  target.addDynamicallyLegalOp<vector::TransferReadOp>(
+      [](vector::TransferReadOp op) {
+        return !op.getPermutationMap().isConstant() &&
+               getTransferReadAlignmentOffset(op, op.getVectorType(), 256) == 0;
+      });
+}
 
 static void
-populateAIEMLCanonicalizeConversionPatterns(RewritePatternSet &patterns) {}
+populateAIEMLCanonicalizeConversionPatterns(RewritePatternSet &patterns) {
+  patterns.add<SplitUnalignedTransferReadPattern>(patterns.getContext(), 128,
+                                                  1024, 256);
+}
 
 //============================================================================//
 //=================== Common AIE Canonicalization Passes =====================//
diff --git a/test/Conversion/VectorToAIEVec/test-conv-op-i16.mlir b/test/Conversion/VectorToAIEVec/test-conv-op-i16.mlir
index 0a2c05c6f5..37db765e3d 100644
--- a/test/Conversion/VectorToAIEVec/test-conv-op-i16.mlir
+++ 
b/test/Conversion/VectorToAIEVec/test-conv-op-i16.mlir @@ -2,37 +2,40 @@ func.func @conv2d(%arg0: memref<18x288xi16>, %arg1: memref<9xi16>, %arg2: memref<16x256xi16>) { %c0 = arith.constant 0 : index + %c2_i32 = arith.constant 2 : i32 + %c4_i32 = arith.constant 4 : i32 affine.for %arg3 = 0 to 16 { affine.for %arg4 = 0 to 256 step 16 { - %0 = aievec.upd %arg0[%arg3, %arg4] {index = 0 : i8, offset = 0 : si32} : memref<18x288xi16>, vector<16xi16> + %0 = aievec.upd %arg0[%arg3, %arg4] {index = 0 : i8, offset = 0 : si32} : memref<18x288xi16>, vector<32xi16> + %sbh = aievec.ext %0 {index = 0 : i8} : vector<32xi16>, vector<16xi16> + %sth = aievec.ext %0 {index = 1 : i8} : vector<32xi16>, vector<16xi16> %1 = aievec.upd %arg1[%c0] {index = 0 : i8, offset = 0 : si32} : memref<9xi16>, vector<16xi16> %2 = aievec.broadcast %1 {idx = 0 : i8} : vector<16xi16>, vector<16xi16> - %3 = arith.muli %0, %2 : vector<16xi16> - %4 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg4) - %5 = aievec.upd %arg0[%arg3, %4] {index = 0 : i8, offset = 0 : si32} : memref<18x288xi16>, vector<16xi16> - %6 = aievec.broadcast %1 {idx = 1 : i8} : vector<16xi16>, vector<16xi16> - %7 = arith.muli %5, %6 : vector<16xi16> - %8 = arith.addi %3, %7 : vector<16xi16> - %9 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg4) - %10 = aievec.upd %arg0[%arg3, %9] {index = 0 : i8, offset = 0 : si32} : memref<18x288xi16>, vector<16xi16> - %11 = aievec.broadcast %1 {idx = 2 : i8} : vector<16xi16>, vector<16xi16> - %12 = arith.muli %10, %11 : vector<16xi16> - %13 = arith.addi %8, %12 : vector<16xi16> - vector.transfer_write %13, %arg2[%arg3, %arg4] {in_bounds = [true]} : vector<16xi16>, memref<16x256xi16> + %3 = arith.muli %sbh, %2 : vector<16xi16> + %4 = aievec.shift %sbh, %sth, %c2_i32 {isAcc = false} : vector<16xi16>, vector<16xi16>, i32, vector<16xi16> + %5 = aievec.broadcast %1 {idx = 1 : i8} : vector<16xi16>, vector<16xi16> + %6 = arith.muli %4, %5 : vector<16xi16> + %7 = arith.addi %3, %6 : vector<16xi16> + %8 = aievec.shift %sbh, %sth, %c4_i32 {isAcc = false} : vector<16xi16>, vector<16xi16>, i32, vector<16xi16> + %9 = aievec.broadcast %1 {idx = 2 : i8} : vector<16xi16>, vector<16xi16> + %10 = arith.muli %8, %9 : vector<16xi16> + %11 = arith.addi %7, %10 : vector<16xi16> + vector.transfer_write %11, %arg2[%arg3, %arg4] {in_bounds = [true]} : vector<16xi16>, memref<16x256xi16> } } return } -// CHECK-LABEL: func @conv2d -// CHECK-SAME: %[[A0:[A-Za-z0-9]+]]: memref<18x288xi16> -// CHECK-SAME: %[[A1:[A-Za-z0-9]+]]: memref<9xi16> -// CHECK-SAME: %[[A2:[A-Za-z0-9]+]]: memref<16x256xi16> -// CHECK: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[T0:.*]] = aievec.upd %[[A1:.*]][%[[C0:.*]]] {index = 0 : i8, offset = 0 : si32} : memref<9xi16>, vector<32xi16> -// CHECK: affine.for %[[A3:.*]] = 0 to 16 { -// CHECK: affine.for %[[A4:.*]] = 0 to 256 step 16 { -// CHECK: %[[T1:.*]] = aievec.upd %[[A0:.*]][%[[A3:.*]], %[[A4:.*]]] {index = 0 : i8, offset = 0 : si32} : memref<18x288xi16>, vector<32xi16> -// CHECK: %[[T2:.*]] = aievec.mul_conv %[[T1:.*]], %[[T0:.*]] {M = 16 : i32, N = 4 : i32} : vector<32xi16>, vector<32xi16>, vector<16xi64> -// CHECK: %[[T3:.*]] = aievec.srs %[[T2:.*]] {shift = 10 : i8} : vector<16xi64>, vector<16xi16> -// CHECK: vector.transfer_write %[[T3:.*]], %[[A2:.*]][%[[A3:.*]], %[[A4:.*]]] {in_bounds = [true]} : vector<16xi16>, memref<16x256xi16> +// CHECK-LABEL: func @conv2d +// CHECK-SAME: %[[A0:[A-Za-z0-9]+]]: memref<18x288xi16> +// CHECK-SAME: %[[A1:[A-Za-z0-9]+]]: memref<9xi16> +// CHECK-SAME: %[[A2:[A-Za-z0-9]+]]: 
memref<16x256xi16> +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[T0:.*]] = aievec.upd %[[A1]][%[[C0]]] {index = 0 : i8, offset = 0 : si32} : memref<9xi16>, vector<16xi16> +// CHECK: %[[T1:.*]] = aievec.concat %[[T0]], %[[T0]] : vector<16xi16>, vector<32xi16> +// CHECK: affine.for %[[A3:.*]] = 0 to 16 { +// CHECK: affine.for %[[A4:.*]] = 0 to 256 step 16 { +// CHECK: %[[T2:.*]] = aievec.upd %[[A0]][%[[A3]], %[[A4]]] {index = 0 : i8, offset = 0 : si32} : memref<18x288xi16>, vector<32xi16> +// CHECK: %[[T3:.*]] = aievec.mul_conv %[[T2]], %[[T1]] {M = 16 : i32, N = 4 : i32} : vector<32xi16>, vector<32xi16>, vector<16xi64> +// CHECK: %[[T4:.*]] = aievec.srs %[[T3]] {shift = 10 : i8} : vector<16xi64>, vector<16xi16> +// CHECK: vector.transfer_write %[[T4]], %[[A2]][%[[A3]], %[[A4]]] {in_bounds = [true]} : vector<16xi16>, memref<16x256xi16> diff --git a/test/Conversion/VectorToAIEVec/test-conv-op-i8-init.mlir b/test/Conversion/VectorToAIEVec/test-conv-op-i8-init.mlir index 57a98d9a1e..4792fa7190 100644 --- a/test/Conversion/VectorToAIEVec/test-conv-op-i8-init.mlir +++ b/test/Conversion/VectorToAIEVec/test-conv-op-i8-init.mlir @@ -2,21 +2,23 @@ func.func @conv2d(%arg0: memref<18x288xi8>, %arg1: memref<48xi8>, %arg2: memref<16x256xi8>) { %c0 = arith.constant 0 : index + %c1_i32 = arith.constant 1 : i32 + %c2_i32 = arith.constant 2 : i32 affine.for %arg3 = 0 to 16 { affine.for %arg4 = 0 to 256 step 32 { %0 = aievec.upd %arg2[%arg3, %arg4] {index = 0 : i8, offset = 0 : si32} : memref<16x256xi8>, vector<32xi8> - %1 = aievec.upd %arg0[%arg3, %arg4] {index = 0 : i8, offset = 0 : si32} : memref<18x288xi8>, vector<32xi8> + %1 = aievec.upd %arg0[%arg3, %arg4] {index = 0 : i8, offset = 0 : si32} : memref<18x288xi8>, vector<64xi8> + %sbh = aievec.ext %1 {index = 0 : i8} : vector<64xi8>, vector<32xi8> + %sth = aievec.ext %1 {index = 1 : i8} : vector<64xi8>, vector<32xi8> %2 = aievec.upd %arg1[%c0] {index = 0 : i8, offset = 0 : si32} : memref<48xi8>, vector<32xi8> %3 = aievec.broadcast %2 {idx = 0 : i8} : vector<32xi8>, vector<32xi8> - %4 = arith.muli %1, %3 : vector<32xi8> + %4 = arith.muli %sbh, %3 : vector<32xi8> %5 = arith.addi %0, %4 : vector<32xi8> - %6 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg4) - %7 = aievec.upd %arg0[%arg3, %6] {index = 0 : i8, offset = 0 : si32} : memref<18x288xi8>, vector<32xi8> + %7 = aievec.shift %sbh, %sth, %c1_i32 {isAcc = false} : vector<32xi8>, vector<32xi8>, i32, vector<32xi8> %8 = aievec.broadcast %2 {idx = 2 : i8} : vector<32xi8>, vector<32xi8> %9 = arith.muli %7, %8 : vector<32xi8> %10 = arith.addi %5, %9 : vector<32xi8> - %11 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg4) - %12 = aievec.upd %arg0[%arg3, %11] {index = 0 : i8, offset = 0 : si32} : memref<18x288xi8>, vector<32xi8> + %12 = aievec.shift %sbh, %sth, %c2_i32 {isAcc = false} : vector<32xi8>, vector<32xi8>, i32, vector<32xi8> %13 = aievec.broadcast %2 {idx = 4 : i8} : vector<32xi8>, vector<32xi8> %14 = arith.muli %12, %13 : vector<32xi8> %15 = arith.addi %10, %14 : vector<32xi8> @@ -26,18 +28,19 @@ func.func @conv2d(%arg0: memref<18x288xi8>, %arg1: memref<48xi8>, %arg2: memref< return } -// CHECK-LABEL: func @conv2d -// CHECK-SAME: %[[A0:[A-Za-z0-9]+]]: memref<18x288xi8> -// CHECK-SAME: %[[A1:[A-Za-z0-9]+]]: memref<48xi8> -// CHECK-SAME: %[[A2:[A-Za-z0-9]+]]: memref<16x256xi8> -// CHECK: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[T0:.*]] = aievec.upd %[[A1:.*]][%[[C0:.*]]] {index = 0 : i8, offset = 0 : si32} : memref<48xi8>, vector<64xi8> -// CHECK: %[[T1:.*]] = aievec.shuffle 
%[[T0:.*]] {mode = 0 : i32} : vector<64xi8>, vector<64xi8> -// CHECK: affine.for %[[A3:.*]] = 0 to 16 { -// CHECK: affine.for %[[A4:.*]] = 0 to 256 step 32 { -// CHECK: %[[T2:.*]] = aievec.upd %[[A2]][%[[A3]], %[[A4]]] {index = 0 : i8, offset = 0 : si32} : memref<16x256xi8>, vector<32xi8> -// CHECK: %[[T3:.*]] = aievec.upd %[[A0]][%[[A3]], %[[A4]]] {index = 0 : i8, offset = 0 : si32} : memref<18x288xi8>, vector<64xi8> -// CHECK: %[[T4:.*]] = aievec.ups %[[T2]] {shift = 0 : i8} : vector<32xi8>, vector<32xi32> -// CHECK: %[[T5:.*]] = aievec.fma_conv %[[T3]], %[[T1]], %[[T4]] {M = 32 : i32, N = 8 : i32} : vector<64xi8>, vector<64xi8>, vector<32xi32> -// CHECK: %[[T6:.*]] = aievec.srs %[[T5]] {shift = 0 : i8} : vector<32xi32>, vector<32xi8> -// CHECK: vector.transfer_write %[[T6]], %[[A2]][%[[A3]], %[[A4]]] {in_bounds = [true]} : vector<32xi8>, memref<16x256xi8> +// CHECK-LABEL: func @conv2d +// CHECK-SAME: %[[A0:[A-Za-z0-9]+]]: memref<18x288xi8> +// CHECK-SAME: %[[A1:[A-Za-z0-9]+]]: memref<48xi8> +// CHECK-SAME: %[[A2:[A-Za-z0-9]+]]: memref<16x256xi8> +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[T0:.*]] = aievec.upd %[[A1]][%[[C0]]] {index = 0 : i8, offset = 0 : si32} : memref<48xi8>, vector<32xi8> +// CHECK: %[[T1:.*]] = aievec.concat %[[T0]], %[[T0]] : vector<32xi8>, vector<64xi8> +// CHECK: %[[T2:.*]] = aievec.shuffle %[[T1]] {mode = 0 : i32} : vector<64xi8>, vector<64xi8> +// CHECK: affine.for %[[I:.*]] = 0 to 16 { +// CHECK: affine.for %[[J:.*]] = 0 to 256 step 32 { +// CHECK: %[[T3:.*]] = aievec.upd %[[A2]][%[[I]], %[[J]]] {index = 0 : i8, offset = 0 : si32} : memref<16x256xi8>, vector<32xi8> +// CHECK: %[[T4:.*]] = aievec.upd %[[A0]][%[[I]], %[[J]]] {index = 0 : i8, offset = 0 : si32} : memref<18x288xi8>, vector<64xi8> +// CHECK: %[[T5:.*]] = aievec.ups %[[T3]] {shift = 0 : i8} : vector<32xi8>, vector<32xi32> +// CHECK: %[[T6:.*]] = aievec.fma_conv %[[T4]], %[[T2]], %[[T5]] {M = 32 : i32, N = 8 : i32} : vector<64xi8>, vector<64xi8>, vector<32xi32> +// CHECK: %[[T7:.*]] = aievec.srs %[[T6]] {shift = 0 : i8} : vector<32xi32>, vector<32xi8> +// CHECK: vector.transfer_write %[[T7]], %[[A2]][%[[I]], %[[J]]] {in_bounds = [true]} : vector<32xi8>, memref<16x256xi8> diff --git a/test/Conversion/VectorToAIEVec/test-conv-op-i8.mlir b/test/Conversion/VectorToAIEVec/test-conv-op-i8.mlir index 5a5d4901f0..50f6d49a47 100644 --- a/test/Conversion/VectorToAIEVec/test-conv-op-i8.mlir +++ b/test/Conversion/VectorToAIEVec/test-conv-op-i8.mlir @@ -2,38 +2,41 @@ func.func @conv2d(%arg0: memref<18x288xi8>, %arg1: memref<48xi8>, %arg2: memref<16x256xi8>) { %c0 = arith.constant 0 : index + %c1_i32 = arith.constant 1 : i32 + %c2_i32 = arith.constant 2 : i32 affine.for %arg3 = 0 to 16 { affine.for %arg4 = 0 to 256 step 32 { - %0 = aievec.upd %arg0[%arg3, %arg4] {index = 0 : i8, offset = 0 : si32} : memref<18x288xi8>, vector<32xi8> + %0 = aievec.upd %arg0[%arg3, %arg4] {index = 0 : i8, offset = 0 : si32} : memref<18x288xi8>, vector<64xi8> + %sbh = aievec.ext %0 {index = 0 : i8} : vector<64xi8>, vector<32xi8> + %sth = aievec.ext %0 {index = 1 : i8} : vector<64xi8>, vector<32xi8> %1 = aievec.upd %arg1[%c0] {index = 0 : i8, offset = 0 : si32} : memref<48xi8>, vector<32xi8> %2 = aievec.broadcast %1 {idx = 0 : i8} : vector<32xi8>, vector<32xi8> - %3 = arith.muli %0, %2 : vector<32xi8> - %4 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg4) - %5 = aievec.upd %arg0[%arg3, %4] {index = 0 : i8, offset = 0 : si32} : memref<18x288xi8>, vector<32xi8> - %6 = aievec.broadcast %1 {idx = 2 : i8} : 
vector<32xi8>, vector<32xi8> - %7 = arith.muli %5, %6 : vector<32xi8> - %8 = arith.addi %3, %7 : vector<32xi8> - %9 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg4) - %10 = aievec.upd %arg0[%arg3, %9] {index = 0 : i8, offset = 0 : si32} : memref<18x288xi8>, vector<32xi8> - %11 = aievec.broadcast %1 {idx = 4 : i8} : vector<32xi8>, vector<32xi8> - %12 = arith.muli %10, %11 : vector<32xi8> - %13 = arith.addi %8, %12 : vector<32xi8> - vector.transfer_write %13, %arg2[%arg3, %arg4] {in_bounds = [true]} : vector<32xi8>, memref<16x256xi8> + %3 = arith.muli %sbh, %2 : vector<32xi8> + %4 = aievec.shift %sbh, %sth, %c1_i32 {isAcc = false} : vector<32xi8>, vector<32xi8>, i32, vector<32xi8> + %5 = aievec.broadcast %1 {idx = 2 : i8} : vector<32xi8>, vector<32xi8> + %6 = arith.muli %4, %5 : vector<32xi8> + %7 = arith.addi %3, %6 : vector<32xi8> + %8 = aievec.shift %sbh, %sth, %c2_i32 {isAcc = false} : vector<32xi8>, vector<32xi8>, i32, vector<32xi8> + %9 = aievec.broadcast %1 {idx = 4 : i8} : vector<32xi8>, vector<32xi8> + %10 = arith.muli %8, %9 : vector<32xi8> + %11 = arith.addi %7, %10 : vector<32xi8> + vector.transfer_write %11, %arg2[%arg3, %arg4] {in_bounds = [true]} : vector<32xi8>, memref<16x256xi8> } } return } -// CHECK-LABEL: func @conv2d -// CHECK-SAME: %[[A0:[A-Za-z0-9]+]]: memref<18x288xi8> -// CHECK-SAME: %[[A1:[A-Za-z0-9]+]]: memref<48xi8> -// CHECK-SAME: %[[A2:[A-Za-z0-9]+]]: memref<16x256xi8> -// CHECK: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[T0:.*]] = aievec.upd %[[A1:.*]][%[[C0:.*]]] {index = 0 : i8, offset = 0 : si32} : memref<48xi8>, vector<64xi8> -// CHECK: %[[T1:.*]] = aievec.shuffle %[[T0:.*]] {mode = 0 : i32} : vector<64xi8>, vector<64xi8> -// CHECK: affine.for %[[A3:.*]] = 0 to 16 { -// CHECK: affine.for %[[A4:.*]] = 0 to 256 step 32 { -// CHECK: %[[T2:.*]] = aievec.upd %[[A0:.*]][%[[A3:.*]], %[[A4:.*]]] {index = 0 : i8, offset = 0 : si32} : memref<18x288xi8>, vector<64xi8> -// CHECK: %[[T3:.*]] = aievec.mul_conv %[[T2:.*]], %[[T1:.*]] {M = 32 : i32, N = 8 : i32} : vector<64xi8>, vector<64xi8>, vector<32xi32> -// CHECK: %[[T4:.*]] = aievec.srs %[[T3:.*]] {shift = 0 : i8} : vector<32xi32>, vector<32xi8> -// CHECK: vector.transfer_write %[[T4:.*]], %[[A2:.*]][%[[A3:.*]], %[[A4:.*]]] {in_bounds = [true]} : vector<32xi8>, memref<16x256xi8> +// CHECK-LABEL: func @conv2d +// CHECK-SAME: %[[A0:[A-Za-z0-9]+]]: memref<18x288xi8> +// CHECK-SAME: %[[A1:[A-Za-z0-9]+]]: memref<48xi8> +// CHECK-SAME: %[[A2:[A-Za-z0-9]+]]: memref<16x256xi8> +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[T0:.*]] = aievec.upd %[[A1]][%[[C0]]] {index = 0 : i8, offset = 0 : si32} : memref<48xi8>, vector<32xi8> +// CHECK: %[[T1:.*]] = aievec.concat %[[T0]], %[[T0]] : vector<32xi8>, vector<64xi8> +// CHECK: %[[T2:.*]] = aievec.shuffle %[[T1]] {mode = 0 : i32} : vector<64xi8>, vector<64xi8> +// CHECK: affine.for %[[I:.*]] = 0 to 16 { +// CHECK: affine.for %[[J:.*]] = 0 to 256 step 32 { +// CHECK: %[[T3:.*]] = aievec.upd %[[A0]][%[[I]], %[[J]]] {index = 0 : i8, offset = 0 : si32} : memref<18x288xi8>, vector<64xi8> +// CHECK: %[[T4:.*]] = aievec.mul_conv %[[T3]], %[[T2]] {M = 32 : i32, N = 8 : i32} : vector<64xi8>, vector<64xi8>, vector<32xi32> +// CHECK: %[[T5:.*]] = aievec.srs %[[T4]] {shift = 0 : i8} : vector<32xi32>, vector<32xi8> +// CHECK: vector.transfer_write %[[T5]], %[[A2]][%[[I]], %[[J]]] {in_bounds = [true]} : vector<32xi8>, memref<16x256xi8> diff --git a/test/Conversion/VectorToAIEVec/test-upd.mlir b/test/Conversion/VectorToAIEVec/test-upd.mlir index a246394108..ada3619e6f 
100644 --- a/test/Conversion/VectorToAIEVec/test-upd.mlir +++ b/test/Conversion/VectorToAIEVec/test-upd.mlir @@ -1,49 +1,66 @@ -// RUN: aie-opt %s --convert-vector-to-aievec | FileCheck %s -// RUN: aie-opt %s --convert-vector-to-aievec="aie-target=aieml" | FileCheck %s --check-prefix=CHECK-V2 +// RUN: aie-opt %s --convert-vector-to-aievec -split-input-file | FileCheck %s +// RUN: aie-opt %s --convert-vector-to-aievec="aie-target=aieml" -split-input-file | FileCheck %s --check-prefix=CHECK-V2 +// CHECK-V2-LABEL: func @veccopy_i8 func.func @veccopy_i8(%arg0: memref<256xi8>, %arg1: memref<256xi8>) { %c0_i8 = arith.constant 0 : i8 affine.for %arg2 = 0 to 256 step 16 { - // CHECK: %[[LD:.*]] = aievec.upd {{.*}} {index = 0 : i8, offset = 0 : si32} : memref<256xi8>, vector<16xi8> + // CHECK-V2: %[[LD:.*]] = aievec.upd {{.*}} {index = 0 : i8, offset = 0 : si32} : memref<256xi8>, vector<16xi8> %0 = vector.transfer_read %arg0[%arg2], %c0_i8 : memref<256xi8>, vector<16xi8> - // CHECK: vector.transfer_write %[[LD]], {{.*}} + // CHECK-V2: vector.transfer_write %[[LD]], {{.*}} vector.transfer_write %0, %arg1[%arg2] : vector<16xi8>, memref<256xi8> } return } +// ----- + +// CHECK-LABEL: func @veccopy_i16 +// CHECK-V2-LABEL: func @veccopy_i16 func.func @veccopy_i16(%arg0: memref<256xi16>, %arg1: memref<256xi16>) { %c0_i16 = arith.constant 0 : i16 affine.for %arg2 = 0 to 256 step 16 { // CHECK: %[[LD:.*]] = aievec.upd {{.*}} {index = 0 : i8, offset = 0 : si32} : memref<256xi16>, vector<16xi16> + // CHECK-V2: %[[LD:.*]] = aievec.upd {{.*}} {index = 0 : i8, offset = 0 : si32} : memref<256xi16>, vector<16xi16> %0 = vector.transfer_read %arg0[%arg2], %c0_i16 : memref<256xi16>, vector<16xi16> // CHECK: vector.transfer_write %[[LD]], {{.*}} + // CHECK-V2: vector.transfer_write %[[LD]], {{.*}} vector.transfer_write %0, %arg1[%arg2] : vector<16xi16>, memref<256xi16> } return } +// ----- + +// CHECK-LABEL: func @veccopy_i32 +// CHECK-V2-LABEL: func @veccopy_i32 func.func @veccopy_i32(%arg0: memref<256xi32>, %arg1: memref<256xi32>) { %c0_i32 = arith.constant 0 : i32 - affine.for %arg2 = 0 to 256 step 16 { - // CHECK: %[[LD:.*]] = aievec.upd {{.*}} {index = 0 : i8, offset = 0 : si32} : memref<256xi32>, vector<16xi32> - %0 = vector.transfer_read %arg0[%arg2], %c0_i32 : memref<256xi32>, vector<16xi32> + affine.for %arg2 = 0 to 256 step 8 { + // CHECK: %[[LD:.*]] = aievec.upd {{.*}} {index = 0 : i8, offset = 0 : si32} : memref<256xi32>, vector<8xi32> + // CHECK-V2: %[[LD:.*]] = aievec.upd {{.*}} {index = 0 : i8, offset = 0 : si32} : memref<256xi32>, vector<8xi32> + %0 = vector.transfer_read %arg0[%arg2], %c0_i32 : memref<256xi32>, vector<8xi32> // CHECK: vector.transfer_write %[[LD]], {{.*}} - vector.transfer_write %0, %arg1[%arg2] : vector<16xi32>, memref<256xi32> + // CHECK-V2: vector.transfer_write %[[LD]], {{.*}} + vector.transfer_write %0, %arg1[%arg2] : vector<8xi32>, memref<256xi32> } return } +// ----- + +// CHECK-LABEL: func @veccopy_long_i32 +// CHECK-V2-LABEL: func @veccopy_long_i32 func.func @veccopy_long_i32(%arg0: memref<256xi32>, %arg1: memref<256xi32>) { %c0_i32 = arith.constant 0 : i32 - affine.for %arg2 = 0 to 256 step 32 { - // CHECK: %[[LD0:.*]] = aievec.upd {{.*}} {index = 0 : i8, offset = 0 : si32} : memref<256xi32>, vector<32xi32> - // CHECK-NEXT: %[[LD1:.*]] = aievec.upd {{.*}}, %[[LD0]] {index = 1 : i8, offset = 512 : si32} : memref<256xi32>, vector<32xi32> + affine.for %arg2 = 0 to 256 step 16 { + // CHECK: %[[LD0:.*]] = aievec.upd {{.*}} {index = 0 : i8, offset = 0 : si32} : memref<256xi32>, 
vector<16xi32> + // CHECK-NEXT: %[[LD1:.*]] = aievec.upd {{.*}}, %[[LD0]] {index = 1 : i8, offset = 256 : si32} : memref<256xi32>, vector<16xi32> // CHECK-V2: %[[LD:.*]] = aievec.upd {{.*}} {index = 0 : i8, offset = 0 : si32} : memref<256xi32>, vector<16xi32> - %0 = vector.transfer_read %arg0[%arg2], %c0_i32 : memref<256xi32>, vector<32xi32> + %0 = vector.transfer_read %arg0[%arg2], %c0_i32 : memref<256xi32>, vector<16xi32> // CHECK: vector.transfer_write %[[LD1]], {{.*}} // CHECK-V2: vector.transfer_write %[[LD]], {{.*}} - vector.transfer_write %0, %arg1[%arg2] : vector<32xi32>, memref<256xi32> + vector.transfer_write %0, %arg1[%arg2] : vector<16xi32>, memref<256xi32> } return } diff --git a/test/Conversion/VectorToAIEVec/unaligned-load-aieml.mlir b/test/Conversion/VectorToAIEVec/unaligned-load-aieml.mlir new file mode 100644 index 0000000000..8774b1f4bc --- /dev/null +++ b/test/Conversion/VectorToAIEVec/unaligned-load-aieml.mlir @@ -0,0 +1,25 @@ +// RUN: aie-opt %s --convert-vector-to-aievec="aie-target=aieml" -split-input-file | FileCheck %s --check-prefix=CHECK +func.func @unaligned_read(%a: memref<48xi8>) -> (vector<32xi8>, vector<32xi8>) { + %c0_i8 = arith.constant 0 : i8 + %c16 = arith.constant 16 : index + %c34 = arith.constant 34 : index + %0 = vector.transfer_read %a[%c16], %c0_i8 : memref<48xi8>, vector<32xi8> + %1 = vector.transfer_read %a[%c34], %c0_i8 : memref<48xi8>, vector<32xi8> + return %0, %1 : vector<32xi8>, vector<32xi8> +} + +// CHECK-LABEL: func @unaligned_read +// CHECK: %[[C2i32:.*]] = arith.constant 2 : i32 +// CHECK: %[[C32:.*]] = arith.constant 32 : index +// CHECK: %[[C16i32:.*]] = arith.constant 16 : i32 +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[T0:.*]] = aievec.upd {{.*}}[%[[C0:.*]]] {index = 0 : i8, offset = 0 : si32} : memref<48xi8>, vector<64xi8> +// CHECK: %[[T0E0:.*]] = aievec.ext %[[T0]] {index = 0 : i8} : vector<64xi8>, vector<32xi8> +// CHECK: %[[T0E1:.*]] = aievec.ext %[[T0]] {index = 1 : i8} : vector<64xi8>, vector<32xi8> +// CHECK: %[[R0:.*]] = aievec.shift %[[T0E0]], %[[T0E1]], %[[C16i32]] {isAcc = false} : vector<32xi8>, vector<32xi8>, i32, vector<32xi8> +// CHECK: %[[T1:.*]] = aievec.upd {{.*}}[%[[C32:.*]]] {index = 0 : i8, offset = 0 : si32} : memref<48xi8>, vector<64xi8> +// CHECK: %[[T1E0:.*]] = aievec.ext %[[T1]] {index = 0 : i8} : vector<64xi8>, vector<32xi8> +// CHECK: %[[T1E1:.*]] = aievec.ext %[[T1]] {index = 1 : i8} : vector<64xi8>, vector<32xi8> +// CHECK: %[[R1:.*]] = aievec.shift %[[T1E0]], %[[T1E1]], %[[C2i32]] {isAcc = false} : vector<32xi8>, vector<32xi8>, i32, vector<32xi8> +// CHECK: return %[[R0:.*]], %[[R1:.*]] : vector<32xi8>, vector<32xi8> + diff --git a/test/Conversion/VectorToAIEVec/unaligned-load.mlir b/test/Conversion/VectorToAIEVec/unaligned-load.mlir index d547c0f9ac..2e4f8e4503 100644 --- a/test/Conversion/VectorToAIEVec/unaligned-load.mlir +++ b/test/Conversion/VectorToAIEVec/unaligned-load.mlir @@ -1,20 +1,70 @@ -// RUN: aie-opt %s --convert-vector-to-aievec | FileCheck %s -func.func @unaligned_read(%a: memref<48xi8>) -> (vector<32xi8>, vector<32xi8>) { - %c0_i8 = arith.constant 0 : i8 - %c16 = arith.constant 16 : index - %c34 = arith.constant 34 : index - %0 = vector.transfer_read %a[%c16], %c0_i8 : memref<48xi8>, vector<32xi8> - %1 = vector.transfer_read %a[%c34], %c0_i8 : memref<48xi8>, vector<32xi8> - return %0, %1 : vector<32xi8>, vector<32xi8> +// RUN: aie-opt %s --convert-vector-to-aievec -split-input-file | FileCheck %s +// RUN: aie-opt %s --convert-vector-to-aievec="aie-target=aieml" 
-split-input-file | FileCheck %s --check-prefix=CHECK-V2 + +// CHECK-LABEL: func @unaligned_read +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[V0B:.*]] = aievec.upd %{{.*}}[%[[C0]]] {index = 0 : i8, offset = 0 : si32} : memref<64xi32>, vector<16xi32> +// CHECK: %[[V0T:.*]] = aievec.upd %{{.*}}[%[[C0]]], %[[V0B]] {index = 1 : i8, offset = 256 : si32} : memref<64xi32>, vector<16xi32> +// CHECK: %[[V0ROT:.*]] = aievec.select %[[V0T]] {select = "0", xoffsets = "0x76543210", xsquare = "0x3210", xstart = "3", +// CHECK-SAME: yoffsets = "0", ysquare = "0", ystart = "0"} +// CHECK-SAME: : vector<16xi32>, vector<16xi32> +// CHECK: %[[V0:.*]] = aievec.ext %[[V0ROT]] {index = 0 : i8} : vector<16xi32>, vector<8xi32> +// CHECK: %[[V1ROT:.*]] = aievec.select %[[V0T]] {select = "0", xoffsets = "0x76543210", xsquare = "0x3210", xstart = "6", +// CHECK-SAME: yoffsets = "0", ysquare = "0", ystart = "0"} +// CHECK-SAME: : vector<16xi32>, vector<16xi32> +// CHECK: %[[V1:.*]] = aievec.ext %[[V1ROT]] {index = 0 : i8} : vector<16xi32>, vector<8xi32> +// CHECK: return %[[V0]], %[[V1]] : vector<8xi32>, vector<8xi32> + +// CHECK-V2-LABEL: func @unaligned_read +// CHECK-V2: %[[C24i32:.*]] = arith.constant 24 : i32 +// CHECK-V2: %[[C12i32:.*]] = arith.constant 12 : i32 +// CHECK-V2: %[[C0:.*]] = arith.constant 0 : index +// CHECK-V2: %[[LV:.*]] = aievec.upd %{{.*}}[%[[C0]]] {index = 0 : i8, offset = 0 : si32} : memref<64xi32>, vector<16xi32> +// CHECK-V2: %[[LV0:.*]] = aievec.ext %[[LV]] {index = 0 : i8} : vector<16xi32>, vector<8xi32> +// CHECK-V2: %[[LV1:.*]] = aievec.ext %[[LV]] {index = 1 : i8} : vector<16xi32>, vector<8xi32> +// CHECK-V2: %[[R0:.*]] = aievec.shift %[[LV0]], %[[LV1]], %[[C12i32]] {isAcc = false} : vector<8xi32>, vector<8xi32>, i32, vector<8xi32> +// CHECK-V2: %[[R1:.*]] = aievec.shift %[[LV0]], %[[LV1]], %[[C24i32]] {isAcc = false} : vector<8xi32>, vector<8xi32>, i32, vector<8xi32> +// CHECK-V2: return %[[R0]], %[[R1]] : vector<8xi32>, vector<8xi32> +func.func @unaligned_read(%m: memref<64xi32>) -> (vector<8xi32>, vector<8xi32>) { + %c0_i32 = arith.constant 0 : i32 + %c3 = arith.constant 3 : index + %c6 = arith.constant 6 : index + %0 = vector.transfer_read %m[%c3], %c0_i32 : memref<64xi32>, vector<8xi32> + %1 = vector.transfer_read %m[%c6], %c0_i32 : memref<64xi32>, vector<8xi32> + return %0, %1 : vector<8xi32>, vector<8xi32> } +// ----- + // CHECK-LABEL: func @unaligned_read -// CHECK : %[[C64:.*]] = arith.constant 64 : index -// CHECK : %[[C32:.*]] = arith.constant 32 : index -// CHECK : %[[C0:.*]] = arith.constant 0 : index -// CHECK : %[[T0:.*]] = aievec.upd {{.*}}[%[[C0:.*]]] {index = 0 : i8, offset = 0 : si32} : memref<48xi8>, vector<32xi8> -// CHECK : %[[T1:.*]] = aievec.upd {{.*}}[%[[C32:.*]]] {index = 0 : i8, offset = 0 : si32} : memref<48xi8>, vector<32xi8> -// CHECK : %[[T2:.*]] = aievec.shift %[[T0:.*]], %[[T1:.*]] {shift = 16 : i32} : vector<32xi8>, vector<32xi8> -// CHECK : %[[T3:.*]] = aievec.upd {{.*}}[%[[C64:.*]]] {index = 0 : i8, offset = 0 : si32} : memref<48xi8>, vector<32xi8> -// CHECK : %[[T4:.*]] = aievec.shift %[[T1:.*]], %[[T3:.*]] {shift = 2 : i32} : vector<32xi8>, vector<32xi8> -// CHECK : return %[[T2:.*]], %[[T4:.*]] : vector<32xi8>, vector<32xi8> +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[V0B:.*]] = aievec.upd %{{.*}}[%[[C0]]] {index = 0 : i8, offset = 0 : si32} : memref<64xi16>, vector<32xi16> +// CHECK: %[[V0T:.*]] = aievec.upd %{{.*}}[%[[C0]]], %[[V0B]] {index = 1 : i8, offset = 256 : si32} : memref<64xi16>, vector<32xi16> +// 
CHECK: %[[V0ROT:.*]] = aievec.select %[[V0T]] {select = "0x11111111", xoffsets = "0x06040200", xoffsets_hi = "0x0e0c0a08", xsquare = "0x2103", xstart = "4", +// CHECK-SAME: yoffsets = "0x0503010f", yoffsets_hi = "0x0d0b0907", ysquare = "0x2103", ystart = "2"} +// CHECK-SAME: : vector<32xi16>, vector<32xi16> +// CHECK: %[[V0:.*]] = aievec.ext %[[V0ROT]] {index = 0 : i8} : vector<32xi16>, vector<16xi16> +// CHECK: %[[V1ROT:.*]] = aievec.select %[[V0T]] {select = "0", xoffsets = "0x06040200", xoffsets_hi = "0x0e0c0a08", xsquare = "0x3210", xstart = "6", +// CHECK-SAME: yoffsets = "0", yoffsets_hi = "0", ysquare = "0", ystart = "0"} +// CHECK-SAME: : vector<32xi16>, vector<32xi16> +// CHECK: %[[V1:.*]] = aievec.ext %[[V1ROT]] {index = 0 : i8} : vector<32xi16>, vector<16xi16> +// CHECK: return %[[V0]], %[[V1]] : vector<16xi16>, vector<16xi16> + +// CHECK-V2-LABEL: func @unaligned_read +// CHECK-V2: %[[C12i32:.*]] = arith.constant 12 : i32 +// CHECK-V2: %[[C6i32:.*]] = arith.constant 6 : i32 +// CHECK-V2: %[[C0:.*]] = arith.constant 0 : index +// CHECK-V2: %[[LV:.*]] = aievec.upd %{{.*}}[%[[C0]]] {index = 0 : i8, offset = 0 : si32} : memref<64xi16>, vector<32xi16> +// CHECK-V2: %[[LV0:.*]] = aievec.ext %[[LV]] {index = 0 : i8} : vector<32xi16>, vector<16xi16> +// CHECK-V2: %[[LV1:.*]] = aievec.ext %[[LV]] {index = 1 : i8} : vector<32xi16>, vector<16xi16> +// CHECK-V2: %[[R0:.*]] = aievec.shift %[[LV0]], %[[LV1]], %[[C6i32]] {isAcc = false} : vector<16xi16>, vector<16xi16>, i32, vector<16xi16> +// CHECK-V2: %[[R1:.*]] = aievec.shift %[[LV0]], %[[LV1]], %[[C12i32]] {isAcc = false} : vector<16xi16>, vector<16xi16>, i32, vector<16xi16> +// CHECK-V2: return %[[R0]], %[[R1]] : vector<16xi16>, vector<16xi16> +func.func @unaligned_read(%m: memref<64xi16>) -> (vector<16xi16>, vector<16xi16>) { + %c0_i16 = arith.constant 0 : i16 + %c3 = arith.constant 3 : index + %c6 = arith.constant 6 : index + %0 = vector.transfer_read %m[%c3], %c0_i16 : memref<64xi16>, vector<16xi16> + %1 = vector.transfer_read %m[%c6], %c0_i16 : memref<64xi16>, vector<16xi16> + return %0, %1 : vector<16xi16>, vector<16xi16> +} diff --git a/test/Integration/Dialect/AIEVec/ml_unaligned_read/i16/helplib.cc b/test/Integration/Dialect/AIEVec/ml_unaligned_read/i16/helplib.cc new file mode 100644 index 0000000000..c1347a4848 --- /dev/null +++ b/test/Integration/Dialect/AIEVec/ml_unaligned_read/i16/helplib.cc @@ -0,0 +1,48 @@ +#include "aie_api/aie.hpp" +#include "aie_api/utils.hpp" +#include +#include + +template const char *tid() { return "@"; } + +template <> const char *tid() { return "i"; } +template <> const char *tid() { return "i"; } +template <> const char *tid() { return "i"; } + +template void printv(vtype v) { + printf("vector<%dx%s%u>[ ", nlanes, tid(), 8 * sizeof(elemtype)); + aie::print(aie::vector(v)); + printf("]\n"); +} + +void printv16xi32(v16int32 v) { printv<16, int32_t>(v); } + +void printv8xi32(v8int32 v) { printv<8, int32_t>(v); } + +void printv32xi16(v32int16 v) { printv<32, int16_t>(v); } + +void printv16xi16(v16int16 v) { printv<16, int16_t>(v); } + +void printv32xi8(v32int8 v) { printv<32, int8_t>(v); } + +alignas(32) int32_t buff_i32[64]; +alignas(32) int16_t buff_i16[64]; +alignas(32) int8_t buff_i8[64]; + +int32_t *loadA64xi32() { + for (int i = 0; i < 64; ++i) + buff_i32[i] = i; + return buff_i32; +} + +int16_t *loadA64xi16() { + for (int i = 0; i < 64; ++i) + buff_i16[i] = i; + return buff_i16; +} + +int8_t *loadA64xi8() { + for (int i = 0; i < 64; ++i) + buff_i8[i] = i; + return buff_i8; +} diff --git 
a/test/Integration/Dialect/AIEVec/ml_unaligned_read/i16/kernel.mlir b/test/Integration/Dialect/AIEVec/ml_unaligned_read/i16/kernel.mlir new file mode 100644 index 0000000000..0834252631 --- /dev/null +++ b/test/Integration/Dialect/AIEVec/ml_unaligned_read/i16/kernel.mlir @@ -0,0 +1,81 @@ +// REQUIRES: valid_xchess_license +// RUN: aie-opt %s -convert-vector-to-aievec="aie-target=aieml" | aie-translate -aieml=true -aievec-to-cpp -o kernel.tmp.cc +// RUN: echo "#include " > kernel.cc && cat kernel.tmp.cc >> kernel.cc +// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. -I%aietools/include -D__AIENGINE__ -D__AIEARCH__=20 kernel.cc %S/helplib.cc %S/main.cc +// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../../profiling.tcl ./work/a.out" | FileCheck %s + +func.func private @printv32xi16(%v : vector<32xi16>) +func.func private @loadA64xi16() -> memref<64xi16> + +#map6 = affine_map<(d0) -> (d0 + 6)> +#map7 = affine_map<(d0) -> (d0 + 7)> +#map8 = affine_map<(d0) -> (d0 + 8)> +#map9 = affine_map<(d0) -> (d0 + 9)> +#map10 = affine_map<(d0) -> (d0 + 10)> + +func.func @entry() -> i32 { + %c0_i32 = arith.constant 0 : i32 + %c0_i16 = arith.constant 0 : i16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c3 = arith.constant 3 : index + %c4 = arith.constant 4 : index + %c5 = arith.constant 5 : index + %c6 = arith.constant 6 : index + %c7 = arith.constant 7 : index + %c8 = arith.constant 8 : index + %c9 = arith.constant 9 : index + %c10 = arith.constant 10 : index + %c11 = arith.constant 11 : index + %c12 = arith.constant 12 : index + %c13 = arith.constant 13 : index + %c14 = arith.constant 14 : index + %c15 = arith.constant 15 : index + + %buffi16 = func.call @loadA64xi16() : () -> (memref<64xi16>) + %v16 = vector.transfer_read %buffi16[%c0], %c0_i16 : memref<64xi16>, vector<32xi16> + func.call @printv32xi16(%v16) : (vector<32xi16>) -> () + + %1 = vector.transfer_read %buffi16[%c1], %c0_i16 : memref<64xi16>, vector<32xi16> + func.call @printv32xi16(%1) : (vector<32xi16>) -> () + %2 = vector.transfer_read %buffi16[%c2], %c0_i16 : memref<64xi16>, vector<32xi16> + func.call @printv32xi16(%2) : (vector<32xi16>) -> () + %3 = vector.transfer_read %buffi16[%c3], %c0_i16 : memref<64xi16>, vector<32xi16> + func.call @printv32xi16(%3) : (vector<32xi16>) -> () + %4 = vector.transfer_read %buffi16[%c4], %c0_i16 : memref<64xi16>, vector<32xi16> + func.call @printv32xi16(%4) : (vector<32xi16>) -> () + %5 = vector.transfer_read %buffi16[%c5], %c0_i16 : memref<64xi16>, vector<32xi16> + func.call @printv32xi16(%5) : (vector<32xi16>) -> () + + %i6 = affine.apply #map6(%c0) + %6 = vector.transfer_read %buffi16[%i6], %c0_i16 : memref<64xi16>, vector<32xi16> + func.call @printv32xi16(%6) : (vector<32xi16>) -> () + %i7 = affine.apply #map7(%c0) + %7 = vector.transfer_read %buffi16[%i7], %c0_i16 : memref<64xi16>, vector<32xi16> + func.call @printv32xi16(%7) : (vector<32xi16>) -> () + %i8 = affine.apply #map8(%c0) + %8 = vector.transfer_read %buffi16[%i8], %c0_i16 : memref<64xi16>, vector<32xi16> + func.call @printv32xi16(%8) : (vector<32xi16>) -> () + %i9 = affine.apply #map9(%c0) + %9 = vector.transfer_read %buffi16[%i9], %c0_i16 : memref<64xi16>, vector<32xi16> + func.call @printv32xi16(%9) : (vector<32xi16>) -> () + %i10 = affine.apply #map10(%c0) + %10 = vector.transfer_read %buffi16[%i10], %c0_i16 : memref<64xi16>, vector<32xi16> + func.call @printv32xi16(%10) : (vector<32xi16>) -> () + + return %c0_i32 : i32 +} + +// 
CHECK-LABEL: vector<32xi16>[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 ] +// CHECK-LABEL: vector<32xi16>[ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 ] +// CHECK-LABEL: vector<32xi16>[ 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 ] +// CHECK-LABEL: vector<32xi16>[ 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 ] +// CHECK-LABEL: vector<32xi16>[ 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 ] +// CHECK-LABEL: vector<32xi16>[ 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 ] +// CHECK-LABEL: vector<32xi16>[ 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 ] +// CHECK-LABEL: vector<32xi16>[ 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 ] +// CHECK-LABEL: vector<32xi16>[ 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 ] +// CHECK-LABEL: vector<32xi16>[ 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 ] +// CHECK-LABEL: vector<32xi16>[ 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 ] +// CHECK-LABEL: SUCCESS diff --git a/test/Integration/Dialect/AIEVec/ml_unaligned_read/i16/main.cc b/test/Integration/Dialect/AIEVec/ml_unaligned_read/i16/main.cc new file mode 100644 index 0000000000..a81ee4f46c --- /dev/null +++ b/test/Integration/Dialect/AIEVec/ml_unaligned_read/i16/main.cc @@ -0,0 +1,11 @@ +#include + +int entry(void); + +int main(void) { + int r = entry(); + if (r) + printf("ERROR: %d", r); + printf("SUCCESS"); + return r; +} diff --git a/test/Integration/Dialect/AIEVec/ml_unaligned_read/i32/helplib.cc b/test/Integration/Dialect/AIEVec/ml_unaligned_read/i32/helplib.cc new file mode 100644 index 0000000000..c1347a4848 --- /dev/null +++ b/test/Integration/Dialect/AIEVec/ml_unaligned_read/i32/helplib.cc @@ -0,0 +1,48 @@ +#include "aie_api/aie.hpp" +#include "aie_api/utils.hpp" +#include +#include + +template const char *tid() { return "@"; } + +template <> const char *tid() { return "i"; } +template <> const char *tid() { return "i"; } +template <> const char *tid() { return "i"; } + +template void printv(vtype v) { + printf("vector<%dx%s%u>[ ", nlanes, tid(), 8 * sizeof(elemtype)); + aie::print(aie::vector(v)); + printf("]\n"); +} + +void printv16xi32(v16int32 v) { printv<16, int32_t>(v); } + +void printv8xi32(v8int32 v) { printv<8, int32_t>(v); } + +void printv32xi16(v32int16 v) { printv<32, int16_t>(v); } + +void printv16xi16(v16int16 v) { printv<16, int16_t>(v); } + +void printv32xi8(v32int8 v) { printv<32, int8_t>(v); } + +alignas(32) int32_t buff_i32[64]; +alignas(32) int16_t buff_i16[64]; +alignas(32) int8_t buff_i8[64]; + +int32_t *loadA64xi32() { + for (int i = 0; i < 64; ++i) + buff_i32[i] = i; + return buff_i32; +} + +int16_t *loadA64xi16() { + for (int i = 0; i < 64; ++i) + buff_i16[i] = i; + return buff_i16; +} + +int8_t *loadA64xi8() { + for (int i = 0; i < 64; ++i) + buff_i8[i] = i; + return buff_i8; +} diff --git a/test/Integration/Dialect/AIEVec/ml_unaligned_read/i32/kernel.mlir b/test/Integration/Dialect/AIEVec/ml_unaligned_read/i32/kernel.mlir new file mode 100644 index 0000000000..a3ea4b8cc8 --- /dev/null +++ b/test/Integration/Dialect/AIEVec/ml_unaligned_read/i32/kernel.mlir @@ -0,0 +1,35 @@ 
+// REQUIRES: valid_xchess_license +// RUN: aie-opt %s -convert-vector-to-aievec="aie-target=aieml" | aie-translate -aieml=true -aievec-to-cpp -o kernel.tmp.cc +// RUN: echo "#include " > kernel.cc && cat kernel.tmp.cc >> kernel.cc +// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. -I%aietools/include -D__AIENGINE__ -D__AIEARCH__=20 kernel.cc %S/helplib.cc %S/main.cc +// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../../profiling.tcl ./work/a.out" | FileCheck %s + +func.func private @printv16xi32(%v : vector<16xi32>) +func.func private @loadA64xi32() -> memref<64xi32> + +#map6 = affine_map<(d0) -> (d0 + 6)> + +func.func @entry() -> i32 { + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c5 = arith.constant 5 : index + + %buffi32 = func.call @loadA64xi32() : () -> (memref<64xi32>) + + %v0 = vector.transfer_read %buffi32[%c0], %c0_i32 : memref<64xi32>, vector<16xi32> + func.call @printv16xi32(%v0) : (vector<16xi32>) -> () + + %v5 = vector.transfer_read %buffi32[%c5], %c0_i32 : memref<64xi32>, vector<16xi32> + func.call @printv16xi32(%v5) : (vector<16xi32>) -> () + + %idx6 = affine.apply #map6(%c0) + %v6 = vector.transfer_read %buffi32[%idx6], %c0_i32 : memref<64xi32>, vector<16xi32> + func.call @printv16xi32(%v6) : (vector<16xi32>) -> () + + return %c0_i32 : i32 +} + +// CHECK-LABEL: vector<16xi32>[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 ] +// CHECK-LABEL: vector<16xi32>[ 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 ] +// CHECK-LABEL: vector<16xi32>[ 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 ] +// CHECK-LABEL: SUCCESS diff --git a/test/Integration/Dialect/AIEVec/ml_unaligned_read/i32/main.cc b/test/Integration/Dialect/AIEVec/ml_unaligned_read/i32/main.cc new file mode 100644 index 0000000000..a81ee4f46c --- /dev/null +++ b/test/Integration/Dialect/AIEVec/ml_unaligned_read/i32/main.cc @@ -0,0 +1,11 @@ +#include + +int entry(void); + +int main(void) { + int r = entry(); + if (r) + printf("ERROR: %d", r); + printf("SUCCESS"); + return r; +} diff --git a/test/Integration/Dialect/AIEVec/ml_unaligned_read/i8/helplib.cc b/test/Integration/Dialect/AIEVec/ml_unaligned_read/i8/helplib.cc new file mode 100644 index 0000000000..4769c84910 --- /dev/null +++ b/test/Integration/Dialect/AIEVec/ml_unaligned_read/i8/helplib.cc @@ -0,0 +1,50 @@ +#include "aie_api/aie.hpp" +#include "aie_api/utils.hpp" +#include +#include + +template const char *tid() { return "@"; } + +template <> const char *tid() { return "i"; } +template <> const char *tid() { return "i"; } +template <> const char *tid() { return "i"; } + +template void printv(vtype v) { + printf("vector<%dx%s%u>[ ", nlanes, tid(), 8 * sizeof(elemtype)); + aie::print(aie::vector(v)); + printf("]\n"); +} + +void printv16xi32(v16int32 v) { printv<16, int32_t>(v); } + +void printv8xi32(v8int32 v) { printv<8, int32_t>(v); } + +void printv32xi16(v32int16 v) { printv<32, int16_t>(v); } + +void printv16xi16(v16int16 v) { printv<16, int16_t>(v); } + +void printv32xi8(v32int8 v) { printv<32, int8_t>(v); } + +void printv64xi8(v64int8 v) { printv<64, int8_t>(v); } + +alignas(32) int32_t buff_i32[64]; +alignas(32) int16_t buff_i16[64]; +alignas(32) int8_t buff_i8[128]; + +int32_t *loadA64xi32() { + for (int i = 0; i < 64; ++i) + buff_i32[i] = i; + return buff_i32; +} + +int16_t *loadA64xi16() { + for (int i = 0; i < 64; ++i) + buff_i16[i] = i; + return buff_i16; +} + +int8_t *loadA128xi8() { + for (int i = 0; i < 128; ++i) + buff_i8[i] = i; + return buff_i8; +} diff --git 
a/test/Integration/Dialect/AIEVec/ml_unaligned_read/i8/kernel.mlir b/test/Integration/Dialect/AIEVec/ml_unaligned_read/i8/kernel.mlir new file mode 100644 index 0000000000..f7f445e360 --- /dev/null +++ b/test/Integration/Dialect/AIEVec/ml_unaligned_read/i8/kernel.mlir @@ -0,0 +1,81 @@ +// REQUIRES: valid_xchess_license +// RUN: aie-opt %s -convert-vector-to-aievec="aie-target=aieml" | aie-translate -aieml=true -aievec-to-cpp -o kernel.tmp.cc +// RUN: echo "#include " > kernel.cc && cat kernel.tmp.cc >> kernel.cc +// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. -I%aietools/include -D__AIENGINE__ -D__AIEARCH__=20 kernel.cc %S/helplib.cc %S/main.cc +// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../../profiling.tcl ./work/a.out" | FileCheck %s + +func.func private @printv64xi8(%v : vector<64xi8>) +func.func private @loadA128xi8() -> memref<128xi8> + +#map6 = affine_map<(d0) -> (d0 + 6)> +#map7 = affine_map<(d0) -> (d0 + 7)> +#map8 = affine_map<(d0) -> (d0 + 8)> +#map9 = affine_map<(d0) -> (d0 + 9)> +#map10 = affine_map<(d0) -> (d0 + 10)> + +func.func @entry() -> i32 { + %c0_i32 = arith.constant 0 : i32 + %c0_i8 = arith.constant 0 : i8 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c3 = arith.constant 3 : index + %c4 = arith.constant 4 : index + %c5 = arith.constant 5 : index + %c6 = arith.constant 6 : index + %c7 = arith.constant 7 : index + %c8 = arith.constant 8 : index + %c9 = arith.constant 9 : index + %c10 = arith.constant 10 : index + %c11 = arith.constant 11 : index + %c12 = arith.constant 12 : index + %c13 = arith.constant 13 : index + %c14 = arith.constant 14 : index + %c15 = arith.constant 15 : index + + %buffi8 = func.call @loadA128xi8() : () -> (memref<128xi8>) + %v16 = vector.transfer_read %buffi8[%c0], %c0_i8 : memref<128xi8>, vector<64xi8> + func.call @printv64xi8(%v16) : (vector<64xi8>) -> () + + %1 = vector.transfer_read %buffi8[%c1], %c0_i8 : memref<128xi8>, vector<64xi8> + func.call @printv64xi8(%1) : (vector<64xi8>) -> () + %2 = vector.transfer_read %buffi8[%c2], %c0_i8 : memref<128xi8>, vector<64xi8> + func.call @printv64xi8(%2) : (vector<64xi8>) -> () + %3 = vector.transfer_read %buffi8[%c3], %c0_i8 : memref<128xi8>, vector<64xi8> + func.call @printv64xi8(%3) : (vector<64xi8>) -> () + %4 = vector.transfer_read %buffi8[%c4], %c0_i8 : memref<128xi8>, vector<64xi8> + func.call @printv64xi8(%4) : (vector<64xi8>) -> () + %5 = vector.transfer_read %buffi8[%c5], %c0_i8 : memref<128xi8>, vector<64xi8> + func.call @printv64xi8(%5) : (vector<64xi8>) -> () + + %i6 = affine.apply #map6(%c0) + %6 = vector.transfer_read %buffi8[%i6], %c0_i8 : memref<128xi8>, vector<64xi8> + func.call @printv64xi8(%6) : (vector<64xi8>) -> () + %i7 = affine.apply #map7(%c0) + %7 = vector.transfer_read %buffi8[%i7], %c0_i8 : memref<128xi8>, vector<64xi8> + func.call @printv64xi8(%7) : (vector<64xi8>) -> () + %i8 = affine.apply #map8(%c0) + %8 = vector.transfer_read %buffi8[%i8], %c0_i8 : memref<128xi8>, vector<64xi8> + func.call @printv64xi8(%8) : (vector<64xi8>) -> () + %i9 = affine.apply #map9(%c0) + %9 = vector.transfer_read %buffi8[%i9], %c0_i8 : memref<128xi8>, vector<64xi8> + func.call @printv64xi8(%9) : (vector<64xi8>) -> () + %i10 = affine.apply #map10(%c0) + %10 = vector.transfer_read %buffi8[%i10], %c0_i8 : memref<128xi8>, vector<64xi8> + func.call @printv64xi8(%10) : (vector<64xi8>) -> () + + return %c0_i32 : i32 +} + +// CHECK-LABEL: vector<64xi8>[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 
+// CHECK-LABEL: vector<64xi8>[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 ]
+// CHECK-LABEL: vector<64xi8>[ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 ]
+// CHECK-LABEL: vector<64xi8>[ 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 ]
+// CHECK-LABEL: vector<64xi8>[ 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 ]
+// CHECK-LABEL: vector<64xi8>[ 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 ]
+// CHECK-LABEL: vector<64xi8>[ 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 ]
+// CHECK-LABEL: vector<64xi8>[ 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 ]
+// CHECK-LABEL: vector<64xi8>[ 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 ]
+// CHECK-LABEL: vector<64xi8>[ 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 ]
+// CHECK-LABEL: vector<64xi8>[ 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 ]
+// CHECK-LABEL: vector<64xi8>[ 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 ]
+// CHECK-LABEL: SUCCESS
diff --git a/test/Integration/Dialect/AIEVec/ml_unaligned_read/i8/main.cc b/test/Integration/Dialect/AIEVec/ml_unaligned_read/i8/main.cc
new file mode 100644
index 0000000000..a81ee4f46c
--- /dev/null
+++ b/test/Integration/Dialect/AIEVec/ml_unaligned_read/i8/main.cc
@@ -0,0 +1,12 @@
+#include <stdio.h>
+
+int entry(void);
+
+int main(void) {
+  int r = entry();
+  if (r)
+    printf("ERROR: %d", r);
+  else
+    printf("SUCCESS");
+  return r;
+}
diff --git a/test/Integration/Dialect/AIEVec/v1_unaligned_read/i16/helplib.cc b/test/Integration/Dialect/AIEVec/v1_unaligned_read/i16/helplib.cc
new file mode 100644
index 0000000000..c1347a4848
--- /dev/null
+++ b/test/Integration/Dialect/AIEVec/v1_unaligned_read/i16/helplib.cc
@@ -0,0 +1,51 @@
+#include "aie_api/aie.hpp"
+#include "aie_api/utils.hpp"
+#include <cstdint>
+#include <cstdio>
+
+template <typename elemtype> const char *tid() { return "@"; }
+
+template <> const char *tid<int8_t>() { return "i"; }
+template <> const char *tid<int16_t>() { return "i"; }
+template <> const char *tid<int32_t>() { return "i"; }
+
+template <int nlanes, typename elemtype, typename vtype>
+void printv(vtype v) {
+  printf("vector<%dx%s%u>[ ", nlanes, tid<elemtype>(), (unsigned)(8 * sizeof(elemtype)));
+  aie::print(aie::vector<elemtype, nlanes>(v));
+  printf("]\n");
+}
+
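+// Non-templated wrappers with C-callable names; kernel.mlir declares these
+// via func.func private and calls them to dump vector contents.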
+void printv16xi32(v16int32 v) { printv<16, int32_t>(v); }
+
+void printv8xi32(v8int32 v) { printv<8, int32_t>(v); }
+
+void printv32xi16(v32int16 v) { printv<32, int16_t>(v); }
+
+void printv16xi16(v16int16 v) { printv<16, int16_t>(v); }
+
+void printv32xi8(v32int8 v) { printv<32, int8_t>(v); }
+
+alignas(32) int32_t buff_i32[64];
+alignas(32) int16_t buff_i16[64];
+alignas(32) int8_t buff_i8[64];
+
+int32_t *loadA64xi32() {
+  for (int i = 0; i < 64; ++i)
+    buff_i32[i] = i;
+  return buff_i32;
+}
+
+int16_t *loadA64xi16() {
+  for (int i = 0; i < 64; ++i)
+    buff_i16[i] = i;
+  return buff_i16;
+}
+
+int8_t *loadA64xi8() {
+  for (int i = 0; i < 64; ++i)
+    buff_i8[i] = i;
+  return buff_i8;
+}
diff --git a/test/Integration/Dialect/AIEVec/v1_unaligned_read/i16/kernel.mlir b/test/Integration/Dialect/AIEVec/v1_unaligned_read/i16/kernel.mlir
new file mode 100644
index 0000000000..6cd644028c
--- /dev/null
+++ b/test/Integration/Dialect/AIEVec/v1_unaligned_read/i16/kernel.mlir
@@ -0,0 +1,84 @@
+// REQUIRES: valid_xchess_license
+// RUN: aie-opt %s -convert-vector-to-aievec | aie-translate -aievec-to-cpp -o kernel.tmp.cc
+// RUN: echo "#include <stdint.h>" > kernel.cc && cat kernel.tmp.cc >> kernel.cc
+// RUN: xchesscc_wrapper aie -f -g +s +w work +o work -I%S -I. -I%aietools/include -D__AIENGINE__ kernel.cc %S/helplib.cc %S/main.cc
+// RUN: xca_udm_dbg -qf -T -P %aietools/data/versal_prod/lib -t "%S/../../profiling.tcl ./work/a.out" | FileCheck %s
+
+func.func private @printv16xi16(%v : vector<16xi16>)
+func.func private @loadA64xi16() -> memref<64xi16>
+
+#map6 = affine_map<(d0) -> (d0 + 6)>
+#map7 = affine_map<(d0) -> (d0 + 7)>
+#map8 = affine_map<(d0) -> (d0 + 8)>
+#map9 = affine_map<(d0) -> (d0 + 9)>
+#map10 = affine_map<(d0) -> (d0 + 10)>
+
+func.func @entry() -> i32 {
+  %c0_i32 = arith.constant 0 : i32
+  %c0_i16 = arith.constant 0 : i16
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+  %c3 = arith.constant 3 : index
+  %c4 = arith.constant 4 : index
+  %c5 = arith.constant 5 : index
+  %c6 = arith.constant 6 : index
+  %c7 = arith.constant 7 : index
+  %c8 = arith.constant 8 : index
+  %c9 = arith.constant 9 : index
+  %c10 = arith.constant 10 : index
+  %c11 = arith.constant 11 : index
+  %c12 = arith.constant 12 : index
+  %c13 = arith.constant 13 : index
+  %c14 = arith.constant 14 : index
+  %c15 = arith.constant 15 : index
+
+  %buffi16 = func.call @loadA64xi16() : () -> (memref<64xi16>)
+  %v16 = vector.transfer_read %buffi16[%c0], %c0_i16 : memref<64xi16>, vector<16xi16>
+  func.call @printv16xi16(%v16) : (vector<16xi16>) -> ()
+
+  %1 = vector.transfer_read %buffi16[%c1], %c0_i16 : memref<64xi16>, vector<16xi16>
+  func.call @printv16xi16(%1) : (vector<16xi16>) -> ()
+  %2 = vector.transfer_read %buffi16[%c2], %c0_i16 : memref<64xi16>, vector<16xi16>
+  func.call @printv16xi16(%2) : (vector<16xi16>) -> ()
+  %3 = vector.transfer_read %buffi16[%c3], %c0_i16 : memref<64xi16>, vector<16xi16>
+  func.call @printv16xi16(%3) : (vector<16xi16>) -> ()
+  %4 = vector.transfer_read %buffi16[%c4], %c0_i16 : memref<64xi16>, vector<16xi16>
+  func.call @printv16xi16(%4) : (vector<16xi16>) -> ()
+  %5 = vector.transfer_read %buffi16[%c5], %c0_i16 : memref<64xi16>, vector<16xi16>
+  func.call @printv16xi16(%5) : (vector<16xi16>) -> ()
+
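+  // Offsets 6-10 are computed with affine.apply so the index reaches the
+  // vector-to-aievec lowering as an affine expression rather than a plain
+  // constant, covering both forms an unaligned read can take.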
+  %i6 = affine.apply #map6(%c0)
+  %6 = vector.transfer_read %buffi16[%i6], %c0_i16 : memref<64xi16>, vector<16xi16>
+  func.call @printv16xi16(%6) : (vector<16xi16>) -> ()
+  %i7 = affine.apply #map7(%c0)
+  %7 = vector.transfer_read %buffi16[%i7], %c0_i16 : memref<64xi16>, vector<16xi16>
+  func.call @printv16xi16(%7) : (vector<16xi16>) -> ()
+  %i8 = affine.apply #map8(%c0)
+  %8 = vector.transfer_read %buffi16[%i8], %c0_i16 : memref<64xi16>, vector<16xi16>
+  func.call @printv16xi16(%8) : (vector<16xi16>) -> ()
+  %i9 = affine.apply #map9(%c0)
+  %9 = vector.transfer_read %buffi16[%i9], %c0_i16 : memref<64xi16>, vector<16xi16>
+  func.call @printv16xi16(%9) : (vector<16xi16>) -> ()
+  %i10 = affine.apply #map10(%c0)
+  %10 = vector.transfer_read %buffi16[%i10], %c0_i16 : memref<64xi16>, vector<16xi16>
+  func.call @printv16xi16(%10) : (vector<16xi16>) -> ()
+
+  return %c0_i32 : i32
+}
+
+// CHECK-LABEL: vector<16xi16>[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 ]
+// CHECK-LABEL: vector<16xi16>[ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 ]
+// CHECK-LABEL: vector<16xi16>[ 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 ]
+// CHECK-LABEL: vector<16xi16>[ 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 ]
+// CHECK-LABEL: vector<16xi16>[ 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 ]
+// CHECK-LABEL: vector<16xi16>[ 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 ]
+// CHECK-LABEL: vector<16xi16>[ 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 ]
+// CHECK-LABEL: vector<16xi16>[ 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 ]
+// CHECK-LABEL: vector<16xi16>[ 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 ]
+// CHECK-LABEL: vector<16xi16>[ 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 ]
+// CHECK-LABEL: vector<16xi16>[ 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 ]
+// CHECK-LABEL: SUCCESS
diff --git a/test/Integration/Dialect/AIEVec/v1_unaligned_read/i16/main.cc b/test/Integration/Dialect/AIEVec/v1_unaligned_read/i16/main.cc
new file mode 100644
index 0000000000..a81ee4f46c
--- /dev/null
+++ b/test/Integration/Dialect/AIEVec/v1_unaligned_read/i16/main.cc
@@ -0,0 +1,12 @@
+#include <stdio.h>
+
+int entry(void);
+
+int main(void) {
+  int r = entry();
+  if (r)
+    printf("ERROR: %d", r);
+  else
+    printf("SUCCESS");
+  return r;
+}
diff --git a/test/Integration/Dialect/AIEVec/v1_unaligned_read/i32/helplib.cc b/test/Integration/Dialect/AIEVec/v1_unaligned_read/i32/helplib.cc
new file mode 100644
index 0000000000..c1347a4848
--- /dev/null
+++ b/test/Integration/Dialect/AIEVec/v1_unaligned_read/i32/helplib.cc
@@ -0,0 +1,51 @@
+#include "aie_api/aie.hpp"
+#include "aie_api/utils.hpp"
+#include <cstdint>
+#include <cstdio>
+
+template <typename elemtype> const char *tid() { return "@"; }
+
+template <> const char *tid<int8_t>() { return "i"; }
+template <> const char *tid<int16_t>() { return "i"; }
+template <> const char *tid<int32_t>() { return "i"; }
+
+template <int nlanes, typename elemtype, typename vtype>
+void printv(vtype v) {
+  printf("vector<%dx%s%u>[ ", nlanes, tid<elemtype>(), (unsigned)(8 * sizeof(elemtype)));
+  aie::print(aie::vector<elemtype, nlanes>(v));
+  printf("]\n");
+}
+
+// Non-templated wrappers with C-callable names; kernel.mlir declares these
+// via func.func private and calls them to dump vector contents.
+void printv16xi32(v16int32 v) { printv<16, int32_t>(v); }
+
+void printv8xi32(v8int32 v) { printv<8, int32_t>(v); }
+
+void printv32xi16(v32int16 v) { printv<32, int16_t>(v); }
+
+void printv16xi16(v16int16 v) { printv<16, int16_t>(v); }
+
+void printv32xi8(v32int8 v) { printv<32, int8_t>(v); }
+
+alignas(32) int32_t buff_i32[64];
+alignas(32) int16_t buff_i16[64];
+alignas(32) int8_t buff_i8[64];
+
+int32_t *loadA64xi32() {
+  for (int i = 0; i < 64; ++i)
+    buff_i32[i] = i;
+  return buff_i32;
+}
+
+int16_t *loadA64xi16() {
+  for (int i = 0; i < 64; ++i)
+    buff_i16[i] = i;
+  return buff_i16;
+}
+
+int8_t *loadA64xi8() {
+  for (int i = 0; i < 64; ++i)
+    buff_i8[i] = i;
+  return buff_i8;
+}
diff --git a/test/Integration/Dialect/AIEVec/v1_unaligned_read/i32/kernel.mlir b/test/Integration/Dialect/AIEVec/v1_unaligned_read/i32/kernel.mlir
new file mode 100644
index 0000000000..670e5c73f1
--- /dev/null
+++ b/test/Integration/Dialect/AIEVec/v1_unaligned_read/i32/kernel.mlir
@@ -0,0 +1,38 @@
+// REQUIRES: valid_xchess_license
+// RUN: aie-opt %s -convert-vector-to-aievec | aie-translate -aievec-to-cpp -o kernel.tmp.cc
+// RUN: echo "#include <stdint.h>" > kernel.cc && cat kernel.tmp.cc >> kernel.cc
+// RUN: xchesscc_wrapper aie -f -g +s +w work +o work -I%S -I. -I%aietools/include -D__AIENGINE__ kernel.cc %S/helplib.cc %S/main.cc
+// RUN: xca_udm_dbg -qf -T -P %aietools/data/versal_prod/lib -t "%S/../../profiling.tcl ./work/a.out" | FileCheck %s
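+// Reads a vector<8xi32> at an aligned offset (0) and at unaligned offsets
+// given as a constant (5) and as an affine expression (6), checking the
+// contents against the identity data from loadA64xi32().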
+
+func.func private @printv8xi32(%v : vector<8xi32>)
+func.func private @loadA64xi32() -> memref<64xi32>
+
+#map6 = affine_map<(d0) -> (d0 + 6)>
+
+func.func @entry() -> i32 {
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c5 = arith.constant 5 : index
+
+  %buffi32 = func.call @loadA64xi32() : () -> (memref<64xi32>)
+
+  %v0 = vector.transfer_read %buffi32[%c0], %c0_i32 : memref<64xi32>, vector<8xi32>
+  func.call @printv8xi32(%v0) : (vector<8xi32>) -> ()
+
+  %v5 = vector.transfer_read %buffi32[%c5], %c0_i32 : memref<64xi32>, vector<8xi32>
+  func.call @printv8xi32(%v5) : (vector<8xi32>) -> ()
+
+  %idx6 = affine.apply #map6(%c0)
+  %v6 = vector.transfer_read %buffi32[%idx6], %c0_i32 : memref<64xi32>, vector<8xi32>
+  func.call @printv8xi32(%v6) : (vector<8xi32>) -> ()
+
+  return %c0_i32 : i32
+}
+
+// CHECK-LABEL: vector<8xi32>[ 0 1 2 3 4 5 6 7 ]
+// CHECK-LABEL: vector<8xi32>[ 5 6 7 8 9 10 11 12 ]
+// CHECK-LABEL: vector<8xi32>[ 6 7 8 9 10 11 12 13 ]
+// CHECK-LABEL: SUCCESS
diff --git a/test/Integration/Dialect/AIEVec/v1_unaligned_read/i32/main.cc b/test/Integration/Dialect/AIEVec/v1_unaligned_read/i32/main.cc
new file mode 100644
index 0000000000..a81ee4f46c
--- /dev/null
+++ b/test/Integration/Dialect/AIEVec/v1_unaligned_read/i32/main.cc
@@ -0,0 +1,12 @@
+#include <stdio.h>
+
+int entry(void);
+
+int main(void) {
+  int r = entry();
+  if (r)
+    printf("ERROR: %d", r);
+  else
+    printf("SUCCESS");
+  return r;
+}
diff --git a/tools/aie-opt/aie-opt.cpp b/tools/aie-opt/aie-opt.cpp
index 480f5d7f8a..c20d2d8ba9 100644
--- a/tools/aie-opt/aie-opt.cpp
+++ b/tools/aie-opt/aie-opt.cpp
@@ -25,6 +25,7 @@
 #include "aie/Dialect/ADF/ADFDialect.h"
 #include "aie/Dialect/AIE/IR/AIEDialect.h"
 #include "aie/Dialect/AIE/Transforms/AIEPasses.h"
+#include "aie/Dialect/AIEVec/Analysis/Passes.h"
 #include "aie/Dialect/AIEVec/IR/AIEVecDialect.h"
 #include "aie/Dialect/AIEVec/Pipelines/Passes.h"
 #include "aie/Dialect/AIEVec/Transforms/Passes.h"
@@ -40,6 +41,7 @@
 int main(int argc, char **argv) {
   xilinx::registerConversionPasses();
   aie::registerAIEPasses();
   xilinx::AIEX::registerAIEXPasses();
+  xilinx::aievec::registerAIEVecAnalysisPasses();
   xilinx::aievec::registerAIEVecPasses();
   xilinx::aievec::registerAIEVecPipelines();