From 0690d7c1110a399bc7e7f25054e3a9780c6f1052 Mon Sep 17 00:00:00 2001 From: James Newling Date: Fri, 11 Oct 2024 09:15:34 -0700 Subject: [PATCH 1/3] squash post addressing Jorn's review comments --- .../samples/matmul_pack_peel_objectfifo.mlir | 4 +- .../AMDAIEDistributeCoresAndObjectFifos.cpp | 265 +----------------- .../AMDAIEDistributeL1Allocations.cpp | 213 ++++++++++++++ .../iree-amd-aie/Transforms/AMDAIEDmaUtils.h | 68 ----- .../iree-amd-aie/Transforms/CMakeLists.txt | 1 + .../iree-amd-aie/Transforms/PassDetail.h | 1 + .../iree-amd-aie/Transforms/Passes.cpp | 5 + .../AMD-AIE/iree-amd-aie/Transforms/Passes.h | 3 + .../AMD-AIE/iree-amd-aie/Transforms/Passes.td | 21 ++ .../Transforms/test/CMakeLists.txt | 1 + .../distribute_cores_and_objectfifos.mlir | 89 +----- .../test/distribute_l1_allocations.mlir | 93 ++++++ 12 files changed, 351 insertions(+), 413 deletions(-) create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeL1Allocations.cpp create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_l1_allocations.mlir diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo.mlir index 59da53759..e902c7e2a 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo.mlir @@ -1,6 +1,6 @@ -// This pipeline is obtained by going into Passes.cpp, and dumping the pass pipeline (at the end of addAMDAIEObjectFifoLoweringPasses) using `passManager.dump()`. This test is included, as it can be useful to have a reference in IR of all the passes that are run. +// This pipeline is obtained by going into Passes.cpp, and dumping the pass pipeline (at the end of addAMDAIEObjectFifoLoweringPasses) using `passManager.dump()`. 
This test is included, as it can be useful to have a reference in IR of all the passes that are run. -// RUN: iree-opt --pass-pipeline="builtin.module(fold-memref-alias-ops,iree-amdaie-convert-to-dma,iree-amdaie-normalize-loop-bounds,iree-amdaie-insert-cores,iree-amdaie-localize-logicalobjectfifo,cse,iree-amdaie-distribute-cores-and-objectfifos,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-split-logical-objectfifos-for-connection-reuse,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-to-circular-dma,func.func(iree-amdaie-create-aie-workgroup),cse,iree-amdaie-dma-cse,iree-amdaie-hoist-logical-objectfifo,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},iree-amdaie-flatten-logicalobjectfifo,iree-amdaie-assign-logical-objectfifo-depth{l1-buffer-depth=2 l2-buffer-depth=2 l3-buffer-depth=1},iree-amdaie-access-to-acquire-release,iree-amdaie-none-access-to-temporary-buffer,iree-amdaie-assign-connection-types,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-composition{only-zero-stride-on-outer-dim=true},cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-assign-npu-dma-bd-ids,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-controlcode-loop-unroll,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-convert-core-forall-to-for,canonicalize{ max-iterations=10 
max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-assign-channels,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-objfifo-bufferization,iree-amdaie-connection-to-flow,iree-amdaie-assign-packet-ids,iree-amdaie-acquire-release-to-use-lock,iree-amdaie-canonicalize-npu-dma-cpy-nd{nb-dimensions=4},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-sink-into-core,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-lower-to-aie,iree-amdaie-remove-memoryspace)" --split-input-file %s | FileCheck %s +// RUN: iree-opt --pass-pipeline="builtin.module(fold-memref-alias-ops,iree-amdaie-distribute-l1-allocations,iree-amdaie-convert-to-dma,iree-amdaie-normalize-loop-bounds,iree-amdaie-insert-cores,iree-amdaie-localize-logicalobjectfifo,cse,iree-amdaie-distribute-cores-and-objectfifos,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-split-logical-objectfifos-for-connection-reuse,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-to-circular-dma,func.func(iree-amdaie-create-aie-workgroup),cse,iree-amdaie-dma-cse,iree-amdaie-hoist-logical-objectfifo,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},iree-amdaie-flatten-logicalobjectfifo,iree-amdaie-assign-logical-objectfifo-depth{l1-buffer-depth=2 l2-buffer-depth=2 l3-buffer-depth=1},iree-amdaie-access-to-acquire-release,iree-amdaie-none-access-to-temporary-buffer,iree-amdaie-assign-connection-types,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false 
top-down=true},iree-amdaie-dma-composition{only-zero-stride-on-outer-dim=true},cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-assign-npu-dma-bd-ids,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-controlcode-loop-unroll,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-convert-core-forall-to-for,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-assign-channels,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-objfifo-bufferization,iree-amdaie-connection-to-flow,iree-amdaie-assign-packet-ids,iree-amdaie-acquire-release-to-use-lock,iree-amdaie-canonicalize-npu-dma-cpy-nd{nb-dimensions=4},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-sink-into-core,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-lower-to-aie,iree-amdaie-remove-memoryspace)" --split-input-file %s | FileCheck %s diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp index 650de7624..3a8b28f28 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp +++ 
b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp @@ -5,10 +5,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include "iree-amd-aie/IR/AMDAIEOps.h" -#include "iree-amd-aie/Transforms/AMDAIEDmaUtils.h" #include "iree-amd-aie/Transforms/Passes.h" #include "iree-amd-aie/Transforms/Transforms.h" -#include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/Debug.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" @@ -17,7 +15,6 @@ #include "mlir/Dialect/Utils/StaticValueUtils.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/IR/Iterators.h" -#include "mlir/IR/Matchers.h" #include "mlir/IR/Verifier.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" @@ -61,258 +58,6 @@ struct LocationMapInfo { // AMDAIEDistributeCoresAndObjectFifosPass //===----------------------------------------------------------------------===// -/// Try to detect subview(s) that look like they are 'distributing' aka -/// 'privatizing'. That is, subview(s) that take an L1 memref spanning all -/// L1 memories of the AIE array, and slice it along tile specific dimensions. -/// If one or more identical subviews are found, return the MemRefType of -/// the subview(s). Otherwise, return an empty MemRefType. -MemRefType getDistributedType(memref::AllocOp alloc) { - MemRefType type{}; - for (Operation *allocUser : alloc->getUsers()) { - if (auto subview = dyn_cast(allocUser)) { - Operation::operand_range offsets = subview.getOffsets(); - - // This subview op is contained inside nested scf.for ops. We count how - // how many of these loop ops are annotated with amdaie.unroll, and are - // sliced on their induction variable. For distributed L2 memory, we - // expect this to be exactly 2, and we expect no slicing in other - // dimensions. It is possible to handle other edge cases, but this is left - // for future work. 
- uint32_t nbNonConstants = - std::count_if(offsets.begin(), offsets.end(), [](Value v) -> bool { - return !mlir::matchPattern(v, mlir::m_Constant()); - }); - if (nbNonConstants != 2) return {}; - uint32_t nbDistributionLoops{0}; - scf::ForOp currentOp = subview->getParentOfType(); - while (currentOp) { - Value iv = currentOp.getInductionVar(); - uint64_t sliceCount = std::count(offsets.begin(), offsets.end(), iv); - if (sliceCount > 1) return {}; - if (sliceCount == 1) { - if (!currentOp->hasAttr(kAMDAIELoopUnroll)) return {}; - ++nbDistributionLoops; - } - currentOp = currentOp->getParentOfType(); - } - if (nbDistributionLoops != 2) return {}; - auto nextType = cast(subview.getResult().getType()); - if (!type) { - type = nextType; - } else if (type != nextType) { - // This is the case where there are 2+ subview ops which look like - // they should be distributing, but they have different result types. - // Bail. - return {}; - } - } - } - return type; -} - -/// Distribute local memory accesses through subviews by allocating a single -/// smaller memory. This is needed because cores can't operate on one larger L1 -/// memory. -LogicalResult distributeLocalMemory(ModuleOp moduleOp) { - IRRewriter rewriter(moduleOp.getContext()); - SmallVector toBeErased; - - moduleOp->walk([&](memref::AllocOp oldAlloc) { - Attribute maybeMemorySpace = - cast(oldAlloc.getResult().getType()).getMemorySpace(); - if (!maybeMemorySpace) return WalkResult::advance(); - auto memorySpace = cast(maybeMemorySpace); - - // Only consider local memory (L1). - if (memorySpace.getInt() != 2) return WalkResult::advance(); - - // Don't try and distribute memory if the alloc is inside a scf.for op. - if (auto scfForOp = oldAlloc->getParentOfType()) - return WalkResult::advance(); - - MemRefType memRefType = getDistributedType(oldAlloc); - - // Failed to find a memref.subview that looks like it is distributing. 
- // This doesn't mean that we can't distribute (for example there might be - // no subviews at all), but this requires further work. - if (!memRefType) return WalkResult::advance(); - - ArrayRef newShape = memRefType.getShape(); - Type elementType = memRefType.getElementType(); - - rewriter.setInsertionPoint(oldAlloc); - MemRefType newAllocType = MemRefType::get( - newShape, elementType, MemRefLayoutAttrInterface{}, memorySpace); - auto newAlloc = rewriter.create(rewriter.getUnknownLoc(), - newAllocType); - auto newDeallocOp = - rewriter.create(rewriter.getUnknownLoc(), newAlloc); - - newDeallocOp->moveBefore(&newAlloc->getBlock()->back()); - - // Replace uses of the old alloc with the new alloc. - for (Operation *userOp : oldAlloc->getUsers()) { - LogicalResult switchResult = - llvm::TypeSwitch(userOp) - .Case([&](memref::SubViewOp subviewOp) { - rewriter.replaceAllUsesWith(subviewOp, newAlloc); - toBeErased.push_back(subviewOp); - return success(); - }) - .Case( - [&](vector::TransferReadOp transferReadOp) { - rewriter.setInsertionPoint(transferReadOp); - // Since in this function we're basically changing the L1 - // sizes of the Alloc, for dimensions with size as 1 we need - // to set the indices as 0. We need to do this at this step - // because there would be loop dependencies on the same and - // when we unroll those loops later in this pass we would - // have incorrect offset values being formed for those - // dimensions. 
- SmallVector newIndices = transferReadOp.getIndices(); - Value c0 = rewriter.create( - transferReadOp.getLoc(), 0); - for (unsigned i = 0, n = newShape.size(); i < n; i++) { - if (newShape[i] == 1) newIndices[i] = c0; - } - - auto newTransferReadOp = - rewriter.create( - transferReadOp.getLoc(), transferReadOp.getType(), - newAlloc, newIndices, - transferReadOp.getPermutationMapAttr(), - transferReadOp.getPadding(), - transferReadOp.getMask(), - transferReadOp.getInBoundsAttr()); - rewriter.replaceAllUsesWith(transferReadOp, - newTransferReadOp.getResult()); - toBeErased.push_back(transferReadOp); - return success(); - }) - .Case( - [&](vector::TransferWriteOp transferWriteOp) { - rewriter.setInsertionPoint(transferWriteOp); - // Since in this function we're basically changing the L1 - // sizes of the Alloc, for dimensions with size as 1 we need - // to set the indices as 0. We need to do this at this step - // because there would be loop dependencies on the same and - // when we unroll those loops later in this pass we would - // have incorrect offset values being formed for those - // dimensions. 
- SmallVector newIndices = - transferWriteOp.getIndices(); - Value c0 = rewriter.create( - transferWriteOp.getLoc(), 0); - - for (unsigned i = 0, n = newShape.size(); i < n; i++) { - if (newShape[i] == 1) newIndices[i] = c0; - } - - rewriter.create( - transferWriteOp.getLoc(), transferWriteOp.getVector(), - newAlloc, newIndices, - transferWriteOp.getPermutationMapAttr(), - transferWriteOp.getMask(), - transferWriteOp.getInBoundsAttr()); - toBeErased.push_back(transferWriteOp); - return success(); - }) - .Case( - [&](memref::ExtractStridedMetadataOp - extractStridedMetadataOp) { - rewriter.setInsertionPoint(extractStridedMetadataOp); - auto newextractStridedMetadataOp = - rewriter.create( - extractStridedMetadataOp.getLoc(), newAlloc); - rewriter.replaceAllUsesWith( - extractStridedMetadataOp.getResults(), - newextractStridedMetadataOp.getResults()); - toBeErased.push_back(extractStridedMetadataOp); - return success(); - }) - .Case([&](memref::DeallocOp deallocOp) { - toBeErased.push_back(userOp); - return success(); - }) - .Case( - [&rewriter, &newAlloc, &toBeErased]( - AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo) { - auto type = llvm::cast(newAlloc.getType()); - - // Collect all DmaCpyNdOps which have 'logicalObjectFifo' as - // a source. Currently not handling the case of multiple. 
- SmallVector dmaOps; - for (Operation *objFifoUserOp : - logicalObjectFifo->getUsers()) { - if (auto dmaOp = - dyn_cast(objFifoUserOp); - dmaOp.getSourceObjectFifo() == logicalObjectFifo) { - dmaOps.push_back(dmaOp); - } - } - if (dmaOps.size() == 0) return success(); - if (dmaOps.size() > 1) { - logicalObjectFifo->emitOpError( - "Case of multiple DMA ops not handled yet (easy " - "extension to logic here)"); - return failure(); - } - AMDAIE::DmaCpyNdOp dmaOp = dmaOps[0]; - - SmallVector empty; - rewriter.setInsertionPoint(logicalObjectFifo); - auto source = - rewriter.create( - rewriter.getUnknownLoc(), - LogicalObjectFifoType::get(type), - newAlloc.getResult()); - rewriter.replaceAllUsesWith(logicalObjectFifo, source); - toBeErased.push_back(logicalObjectFifo); - rewriter.setInsertionPoint(dmaOp); - auto newDmaOp = rewriter.create( - dmaOp.getLoc(), dmaOp.getTarget(), - dmaOp.getTargetMixedOffsets(), - dmaOp.getTargetMixedSizes(), - dmaOp.getTargetMixedStrides(), source, - dmaOp.getSourceMixedOffsets(), - dmaOp.getSourceMixedSizes(), - dmaOp.getSourceMixedStrides()); - rewriter.replaceAllUsesWith(dmaOp, newDmaOp); - // TODO: maybe this should be left to a DCE somewhere, - // instead of manually erasing unused ops? - toBeErased.push_back(dmaOp); - // We have to discard non-zero offsets as subview has - // been replaced by a dedicated allocated memref. 
- SmallVector allocShape(type.getShape()); - (void)discardAllNonZeroOffsets( - rewriter, - cast( - newDmaOp.getOperation()), - allocShape); - return success(); - }) - .Default([&](Operation *userOp) { - userOp->emitOpError( - "needs to have logic implemented for handling in " - "distributeLocalMemory"); - return failure(); - }); - - if (failed(switchResult)) return WalkResult::interrupt(); - } - toBeErased.push_back(oldAlloc); - - return WalkResult::advance(); - }); - - for (Operation *op : toBeErased) { - op->dropAllUses(); - rewriter.eraseOp(op); - } - - return success(); -} - /// Convert inner scf.forall ops chosen for parallel distribution to scf.for /// ops. LogicalResult localForallToFor(ModuleOp moduleOp) { @@ -649,8 +394,7 @@ LogicalResult insertLogicalObjectFifoAccess(ModuleOp moduleOp) { } for (auto &&[idx, operand] : llvm::enumerate(op->getOpOperands())) { Operation *operandDefiningOp = operand.get().getDefiningOp(); - if (!dyn_cast_if_present(operandDefiningOp)) - continue; + if (!dyn_cast_if_present(operandDefiningOp)) continue; if (memrefToLogicalObjectFifoAccess.contains(operand.get())) { op->setOperand(idx, memrefToLogicalObjectFifoAccess[operand.get()]); } else if (memrefToLogicalObjectFifo.contains(operand.get())) { @@ -956,13 +700,6 @@ void AMDAIEDistributeCoresAndObjectFifosPass::runOnOperation() { LLVM_DEBUG(llvm::dbgs() << "Module after localForallToFor: \n" << moduleOp << "\n"); - if (failed(distributeLocalMemory(moduleOp))) { - moduleOp.emitOpError() << "local memory distribution failed"; - return signalPassFailure(); - } - LLVM_DEBUG(llvm::dbgs() << "Module after distributeLocalMemory: \n" - << moduleOp << "\n"); - if (failed(verify(moduleOp, true))) { return signalPassFailure(); } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeL1Allocations.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeL1Allocations.cpp new file mode 100644 index 000000000..63e797356 --- /dev/null +++ 
b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeL1Allocations.cpp @@ -0,0 +1,213 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree-amd-aie/IR/AMDAIEDialect.h" +#include "iree-amd-aie/Transforms/Passes.h" +#include "llvm/ADT/TypeSwitch.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/SCF/Transforms/Transforms.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/IR/Matchers.h" +#include "mlir/IR/Verifier.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/LoopInvariantCodeMotionUtils.h" + +#define DEBUG_TYPE "iree-amdaie-distribute-l1-allocations" + +namespace mlir::iree_compiler::AMDAIE { + +using namespace mlir; + +namespace { + +/// Find all induction variables of all `scf.forall` ops that are mapped to +/// gpu thread dimensions (as opposed to gpu block dimensions etc). +FailureOr> getThreadIndVars(ModuleOp moduleOp) { + DenseSet threadIndVars; + moduleOp.walk([&](scf::ForallOp forallOp) { + std::optional maybeMapping = forallOp.getMapping(); + if (!maybeMapping) return WalkResult::advance(); + SmallVector mapping = llvm::to_vector(maybeMapping->getValue()); + if (mapping.empty()) return WalkResult::advance(); + if (!isa(*mapping.begin())) + return WalkResult::advance(); + for (Value indVar : forallOp.getInductionVars()) { + threadIndVars.insert(indVar); + } + return WalkResult::advance(); + }); + return threadIndVars; +} + +/// Try to detect subview(s) that look like they're 'distributing' L1 memory. +/// That is: they slice the L1 memory along thread/tile dimensions. 
+MemRefType getDistributedType(memref::AllocOp alloc, + const DenseSet &indVars) { + MemRefType type; + for (Operation *allocUser : alloc->getUsers()) { + if (auto subview = dyn_cast(allocUser)) { + // Check that all offsets are either constants or thread ids. We assume + // that if a subview has an offset which is not a constant and not a + // thread id, it's not 'distributing'. + Operation::operand_range offsets = subview.getOffsets(); + for (Value offset : offsets) { + bool isConst = matchPattern(offset, m_Constant()); + bool isIndVar = llvm::is_contained(indVars, offset); + if (!isConst && !isIndVar) return {}; + } + + auto nextType = cast(subview.getResult().getType()); + if (!type) { + type = nextType; + } else if (type != nextType) { + // This is the case where there are 2+ subview ops which look like + // they should be distributing, but they have different result types. + // Bail. + return {}; + } + } + } + return type; +} + +/// Create a copy of `toUpdate` with all values in `toRemove` replaced by +/// `replacement`. +template +SmallVector substitute(Container toUpdate, + const DenseSet &toRemove, + Value replacement) { + SmallVector updated(toUpdate.begin(), toUpdate.end()); + for (Value &v : updated) { + if (toRemove.contains(v)) v = replacement; + } + return updated; +} + +/// Distribute local memory accesses through subviews by allocating a single, +/// smaller memory. This is ultimately needed because cores can't operate on +/// one shared L1 memory. +LogicalResult distributeLocalMemory(ModuleOp moduleOp) { + FailureOr> maybeIndVars = getThreadIndVars(moduleOp); + if (failed(maybeIndVars)) return failure(); + const DenseSet &indVars = maybeIndVars.value(); + IRRewriter rewriter(moduleOp.getContext()); + moduleOp->walk([&](memref::AllocOp oldAlloc) { + // Only consider local memory (L1). 
+ Attribute maybeMemorySpace = oldAlloc.getType().getMemorySpace(); + if (!maybeMemorySpace) return WalkResult::advance(); + auto memorySpace = cast(maybeMemorySpace); + if (memorySpace.getInt() != 2) return WalkResult::advance(); + + // Don't try and distribute memory if the alloc is inside a scf.for op. + if (auto scfForOp = oldAlloc->getParentOfType()) + return WalkResult::advance(); + + MemRefType memRefType = getDistributedType(oldAlloc, indVars); + + // Failed to find a memref.subview that looks like it is distributing. + // This doesn't mean that we can't distribute (for example there might be + // no subviews at all), but this requires further work. + if (!memRefType) return WalkResult::advance(); + + ArrayRef newShape = memRefType.getShape(); + Type elementType = memRefType.getElementType(); + + rewriter.setInsertionPoint(oldAlloc); + MemRefType newAllocType = MemRefType::get( + newShape, elementType, MemRefLayoutAttrInterface{}, memorySpace); + auto newAlloc = rewriter.create(rewriter.getUnknownLoc(), + newAllocType); + + const SmallVector users(oldAlloc->user_begin(), + oldAlloc->user_end()); + + // Replace uses of the old alloc with the new alloc. 
+ for (Operation *user : users) { + LogicalResult switchResult = + llvm::TypeSwitch(user) + .Case([&](memref::SubViewOp subviewOp) { + rewriter.replaceAllUsesWith(subviewOp, newAlloc); + return success(); + }) + .Case([&](vector::TransferReadOp readOp) { + rewriter.setInsertionPoint(readOp); + Value c0 = + rewriter.create(readOp.getLoc(), 0); + SmallVector indices = + substitute(readOp.getIndices(), indVars, c0); + rewriter.replaceOpWithNewOp( + readOp, readOp.getType(), newAlloc, indices, + readOp.getPermutationMapAttr(), readOp.getPadding(), + readOp.getMask(), readOp.getInBoundsAttr()); + return success(); + }) + .Case( + [&](vector::TransferWriteOp writeOp) { + rewriter.setInsertionPoint(writeOp); + Value c0 = rewriter.create( + writeOp.getLoc(), 0); + SmallVector indices = + substitute(writeOp.getIndices(), indVars, c0); + rewriter.replaceOpWithNewOp( + writeOp, writeOp.getVector(), newAlloc, indices, + writeOp.getPermutationMapAttr(), writeOp.getMask(), + writeOp.getInBoundsAttr()); + return success(); + }) + .Case( + [&](memref::ExtractStridedMetadataOp + extractStridedMetadataOp) { + rewriter + .replaceOpWithNewOp( + extractStridedMetadataOp, newAlloc); + return success(); + }) + .Case([&](memref::DeallocOp deallocOp) { + rewriter.setInsertionPoint(deallocOp); + rewriter.create(rewriter.getUnknownLoc(), + newAlloc); + return success(); + }) + .Default([&](Operation *user) { + user->emitOpError("needs logic implemented for handling."); + return failure(); + }); + + if (failed(switchResult)) return WalkResult::interrupt(); + } + + return WalkResult::advance(); + }); + + return success(); +} + +class AMDAIEDistributeL1AllocationsPass + : public impl::AMDAIEDistributeL1AllocationsBase< + AMDAIEDistributeL1AllocationsPass> { + public: + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + + AMDAIEDistributeL1AllocationsPass() = default; + AMDAIEDistributeL1AllocationsPass( + const AMDAIEDistributeL1AllocationsPass 
&pass){}; + void runOnOperation() override; +}; + +void AMDAIEDistributeL1AllocationsPass::runOnOperation() { + ModuleOp moduleOp = getOperation(); + if (failed(distributeLocalMemory(moduleOp))) return signalPassFailure(); +} +} // namespace + +std::unique_ptr createAMDAIEDistributeL1AllocationsPass() { + return std::make_unique(); +} +} // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.h index dc9f88c23..567c46bea 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.h @@ -154,74 +154,6 @@ LogicalResult foldUnitDims(const SmallVector &offsets, SmallVector &newStrides, SmallVector &newSizes); -/// Utility to discard all non-zero offsets that have dimension equal to 1 on -/// the same index of the provided shape. This helps with updating DMA -/// operations for a shape change. If an empty shape is passed, all non-zero -/// offsets will be removed. -template -AMDAIE::DoublyStridedOpInterface discardAllNonZeroOffsets( - RewriterBase &rewriter, AMDAIE::DoublyStridedOpInterface op, - SmallVector &shape) { - SmallVector newSourceOffsets; - SmallVector newSourceSizes; - SmallVector newSourceStrides; - SmallVector newTargetOffsets; - SmallVector newTargetSizes; - SmallVector newTargetStrides; - if constexpr (OperateOn == CopyOpOperateOn::Source) { - SmallVector offsets = op.getSourceMixedOffsets(); - SmallVector sizes = op.getSourceMixedSizes(); - SmallVector strides = op.getSourceMixedStrides(); - // Set shape to a vector of ones as a default. - if (shape.empty()) { - SmallVector ones(offsets.size(), 1); - shape = ones; - } - if (shape.size() != offsets.size()) return op; - // Fill source offsets/sizes/strides. 
- for (auto &&[offset, size, stride, dim] : - llvm::zip(offsets, sizes, strides, shape)) { - std::optional constantOffset = getConstantIntValue(offset); - if (dim == 1 && !constantOffset) continue; - if (dim == 1 && constantOffset && constantOffset.value() != 0) continue; - newSourceOffsets.push_back(offset); - newSourceSizes.push_back(size); - newSourceStrides.push_back(stride); - } - newTargetOffsets = op.getTargetMixedOffsets(); - newTargetSizes = op.getTargetMixedSizes(); - newTargetStrides = op.getTargetMixedStrides(); - } else if constexpr (OperateOn == CopyOpOperateOn::Target) { - SmallVector offsets = op.getTargetMixedOffsets(); - SmallVector sizes = op.getTargetMixedSizes(); - SmallVector strides = op.getTargetMixedStrides(); - // Set shape to a vector of ones as a default. - if (shape.empty()) { - SmallVector ones(offsets.size(), 1); - shape = ones; - } - if (shape.size() != offsets.size()) return op; - // Fill source offsets/sizes/strides. - for (auto &&[offset, size, stride, dim] : - llvm::zip(offsets, sizes, strides, shape)) { - std::optional constantOffset = getConstantIntValue(offset); - if (dim == 1 && !constantOffset) continue; - if (dim == 1 && constantOffset && constantOffset.value() != 0) continue; - newTargetOffsets.push_back(offset); - newTargetSizes.push_back(size); - newTargetStrides.push_back(stride); - } - newSourceOffsets = op.getSourceMixedOffsets(); - newSourceSizes = op.getSourceMixedSizes(); - newSourceStrides = op.getSourceMixedStrides(); - } - rewriter.setInsertionPointAfter(op); - auto newDoublyStridedOp = op.createDoublyStridedOp( - rewriter, newTargetOffsets, newTargetSizes, newTargetStrides, - newSourceOffsets, newSourceSizes, newSourceStrides); - rewriter.replaceOp(op, newDoublyStridedOp.getOperation()); - return newDoublyStridedOp; -} /// Utility DMA configuration which is calculated based on AMDAIEDeviceModel /// information. 
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt index d2a7cb0e2..a467ce00e 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt @@ -63,6 +63,7 @@ iree_cc_library( "AMDAIECreateAIEWorkgroup.cpp" "AMDAIECreateReferenceToAllocation.cpp" "AMDAIEDistributeCoresAndObjectFifos.cpp" + "AMDAIEDistributeL1Allocations.cpp" "AMDAIEDmaCSE.cpp" "AMDAIEDmaComposition.cpp" "AMDAIEDmaLoopSubsumption.cpp" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h index 7c7105b0b..4cd5586f0 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h @@ -41,6 +41,7 @@ namespace mlir::iree_compiler::AMDAIE { #define GEN_PASS_DEF_AMDAIECREATEREFERENCETOALLOCATION #define GEN_PASS_DEF_AMDAIEDECOMPOSELINALGEXTPACKUNPACKTOAIR #define GEN_PASS_DEF_AMDAIEDISTRIBUTECORESANDOBJECTFIFOS +#define GEN_PASS_DEF_AMDAIEDISTRIBUTEL1ALLOCATIONS #define GEN_PASS_DEF_AMDAIEDMACOMPOSITION #define GEN_PASS_DEF_AMDAIEDMACSE #define GEN_PASS_DEF_AMDAIEDMALOOPSUBSUMPTION diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp index 454d54ae9..4bc7c8bc4 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp @@ -545,6 +545,11 @@ void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager, TilePassPipeline useTilePipeline) { passManager.addPass(createEraseHALDescriptorTypeFromMemRefPass()); passManager.addPass(memref::createFoldMemRefAliasOpsPass()); + + passManager.addPass(createAMDAIEDistributeL1AllocationsPass()); 
+  passManager.addPass(createCanonicalizerPass()); +  passManager.addPass(createCSEPass()); +  passManager.addPass(createCanonicalizerPass());    // For matmul pipelines, we do transpose on target side for pack ops to get // better performance. While for convolution pipelines, the same settings diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h index 00aa88694..df670e19f 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h @@ -138,6 +138,9 @@ std::unique_ptr createAMDAIEDecomposeLinalgExtPackUnPackToAIRPass(); /// operations and distribute the logical objectFifos. std::unique_ptr createAMDAIEDistributeCoresAndObjectFifosPass(); +/// Create pass to distribute/privatize/localize memory allocations in L1 memory. +std::unique_ptr createAMDAIEDistributeL1AllocationsPass(); + /// Create a pass to compose more complex DMA operations, e.g. by combining DMA /// operations and/or subsuming loop iterations into the strided access /// patterns. diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td index e6910bbde..7c8364fed 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td @@ -182,6 +182,27 @@ def AMDAIEDistributeCoresAndObjectFifos : let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEDistributeCoresAndObjectFifosPass()"; } +def AMDAIEDistributeL1Allocations : + Pass<"iree-amdaie-distribute-l1-allocations", "ModuleOp"> { + let summary = "Replace distributed L1 allocations with private allocations."; + let description = [{ + Each AIE core/tile is uniquely identified by gpu thread ids, usually + 'y' (for AIE row) and 'x' (for AIE column). 
+ + Some of the compilation pipelines in iree-amd-aie generate a single L1 + memory allocation describing the concatenation of all memory for all + cores/tiles. Each thread then slices into a mutually exclusive rectangle + of the allocation, along its thread dimensions, so 'privatizing' its + memory. + + This pass rewrites these allocations to be private to each core/tile. So + it replaces a large allocation in L1 with a smaller allocation, smaller by + a factor of the number of cores/threads. + }]; + let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEDistributeL1AllocationsPass()"; +} + + def AMDAIEDmaComposition : Pass<"iree-amdaie-dma-composition"> { let summary = "Compose DMA operations by DMA combination and loop subsumption."; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt index c083125bd..570e66b83 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt @@ -28,6 +28,7 @@ iree_lit_test_suite( "create_reference_to_allocation.mlir" "disable_vectorization.mlir" "distribute_cores_and_objectfifos.mlir" + "distribute_l1_allocations.mlir" "dma_composition.mlir" "dma_cse.mlir" "dma_loop_subsumption_circular.mlir" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_cores_and_objectfifos.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_cores_and_objectfifos.mlir index 6daf18fc9..e36dcf26c 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_cores_and_objectfifos.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_cores_and_objectfifos.mlir @@ -1,3 +1,4 @@ +// R UN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-distribute-l1-allocations,iree-amdaie-distribute-cores-and-objectfifos,cse)" 
--split-input-file --verify-diagnostics %s | FileCheck %s // RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-distribute-cores-and-objectfifos,cse)" --split-input-file --verify-diagnostics %s | FileCheck %s // Check for unrolling an amdaie.core within a parallel loop with a single @@ -571,76 +572,6 @@ module { } } -// ----- - -// Ensure subviews on local memrefs inside cores are handled correctly by discarding the consuming DMAs' non-zero offsets. -// CHECK-LABEL: @local_subview_output -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index -// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index -// CHECK-DAG: %[[ALLOC_0:.+]] = memref.alloc() : memref<1x1x32x32xi32, 2> -// CHECK-DAG: %[[ALLOC_1:.+]] = memref.alloc() : memref<2x2x32x32xi32, 1> -// CHECK-DAG: %[[ALLOC_2:.+]] = memref.alloc() : memref<64x64xi32> -// CHECK: scf.forall (%{{.+}}, %[[ARG1:.+]]) in (2, 2) -// CHECK-DAG: %[[TILE_0_2:.+]] = amdaie.tile(%[[C0]], %[[C2]]) -// CHECK-DAG: %[[TILE_0_3:.+]] = amdaie.tile(%[[C0]], %[[C3]]) -// CHECK-DAG: %[[TILE_1_2:.+]] = amdaie.tile(%[[C1]], %[[C2]]) -// CHECK-DAG: %[[TILE_1_3:.+]] = amdaie.tile(%[[C1]], %[[C3]]) -// CHECK-DAG: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) -// CHECK-DAG: %[[TILE_0_1:.+]] = amdaie.tile(%[[C0]], %[[C1]]) -// CHECK-DAG: %[[FROM_MEMREF_0:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_2]]} -// CHECK-DAG: %[[FROM_MEMREF_1:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_1_2]]} -// CHECK-DAG: %[[FROM_MEMREF_2:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_3]]} -// CHECK-DAG: %[[FROM_MEMREF_3:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_1_3]]} -// CHECK-DAG: %[[FROM_MEMREF_4:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_1]]} -// CHECK-DAG: %[[FROM_MEMREF_5:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_2]], 
{%[[TILE_0_0]]} -// CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_4]][%c0, %c0] [%c1, %c1] [%c1, %c1], %[[FROM_MEMREF_0]][0, 0] [32, 32] [32, 1] -// CHECK-DAG: %[[CORE_0_2:.*]] = amdaie.core(%[[TILE_0_2]], in : [], out : [%[[DMA_0]]]) -// CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_0]], Write) -// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<1x1x32x32xi32, 2>) -// CHECK-DAG: %[[DMA_1:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_4]][%c0, %c1] [%c1, %c1] [%c1, %c1], %[[FROM_MEMREF_1]][0, 0] [32, 32] [32, 1] -// CHECK-DAG: %[[CORE_1_2:.*]] = amdaie.core(%[[TILE_1_2]], in : [], out : [%[[DMA_1]]]) -// CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Write) -// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<1x1x32x32xi32, 2>) -// CHECK-DAG: %[[DMA_2:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_4]][%c1, %c0] [%c1, %c1] [%c1, %c1], %[[FROM_MEMREF_2]][0, 0] [32, 32] [32, 1] -// CHECK-DAG: %[[CORE_0_3:.*]] = amdaie.core(%[[TILE_0_3]], in : [], out : [%[[DMA_2]]]) -// CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_2]], Write) -// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<1x1x32x32xi32, 2>) -// CHECK-DAG: %[[DMA_3:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_4]][%c1, %c1] [%c1, %c1] [%c1, %c1], %[[FROM_MEMREF_3]][0, 0] [32, 32] [32, 1] -// CHECK-DAG: %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]], in : [], out : [%[[DMA_3]]]) -// CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_3]], Write) -// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<1x1x32x32xi32, 2>) -module { - func.func @local_subview_output() { - %c0_i32 = arith.constant 0 : i32 - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %alloc_0 = memref.alloc() : memref<2x2x32x32xi32, 2> - %alloc_1 = memref.alloc() : memref<2x2x32x32xi32, 1> - %alloc_2 = memref.alloc() : memref<64x64xi32> - scf.forall (%arg0, 
%arg1) in (2, 2) { - %0 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<2x2x32x32xi32, 2> -> !amdaie.logicalobjectfifo> - %1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<2x2x32x32xi32, 1> -> !amdaie.logicalobjectfifo> - %2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<64x64xi32> -> !amdaie.logicalobjectfifo> - scf.forall (%arg2, %arg3) in (2, 2) { - %subview = memref.subview %alloc_0[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x2x32x32xi32, 2> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 2> - %8 = amdaie.dma_cpy_nd(%1[%arg2, %arg3] [%c1, %c1] [%c1, %c1], %0[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %add = arith.addi %arg2, %c2 : index - %tile = amdaie.tile(%arg3, %add) - %core = amdaie.core(%tile, in : [], out : [%8]) { - linalg.fill ins(%c0_i32 : i32) outs(%subview : memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 2>) - amdaie.end - } - } {mapping = [#gpu.thread, #gpu.thread]} - %9 = amdaie.dma_cpy_nd(%2[%arg1] [%c1] [%c1], %1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - } {mapping = [#gpu.block, #gpu.block]} - memref.dealloc %alloc_2 : memref<64x64xi32> - memref.dealloc %alloc_1 : memref<2x2x32x32xi32, 1> - memref.dealloc %alloc_0 : memref<2x2x32x32xi32, 2> - return - } -} // ----- @@ -662,19 +593,19 @@ module { func.func @l1_temporary_buffer_for_matmul_elem() { %c0_i32 = arith.constant 0 : i32 %c2 = arith.constant 2 : index - %alloc_6 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32> + %alloc_6 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> scf.forall (%arg0, %arg1) in (1, 1) { scf.forall (%arg2, %arg3) in (1, 1) { - %subview = memref.subview %alloc_6[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> + %subview = memref.subview %alloc_6[0, 
0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, 2 : i32> %26 = arith.addi %arg2, %c2 : index %tile = amdaie.tile(%arg3, %26) %27 = amdaie.core(%tile, in : [], out : []) { - linalg.fill ins(%c0_i32 : i32) outs(%subview : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) + linalg.fill ins(%c0_i32 : i32) outs(%subview : memref<1x1x8x8x4x4xi32, 2 : i32>) amdaie.end } } {mapping = [#gpu.thread, #gpu.thread]} } {mapping = [#gpu.block, #gpu.block]} - memref.dealloc %alloc_6 : memref<2x2x8x8x4x4xi32, 2 : i32> + memref.dealloc %alloc_6 : memref<1x1x8x8x4x4xi32, 2 : i32> return } @@ -1013,27 +944,27 @@ module { %0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> %alloc_2 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32> %1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> - %alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32> + %alloc_3 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> scf.forall (%arg0, %arg1) in (1, 1) { %13 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x8x4x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> %14 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> scf.forall (%arg2, %arg3) in (1, 1) { %19 = amdaie.dma_cpy_nd(%14[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c256, %c32, %c8, %c1], %1[%arg2, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c8, %c128, %c32, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %20 = amdaie.dma_cpy_nd(%13[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c128, %c32, %c4, %c1], %0[%c0, %arg3, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c2048, %c1024, %c4, %c256, %c32, %c1]) : 
(!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %subview = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> + %subview = memref.subview %alloc_3[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, 2 : i32> %21 = arith.addi %arg2, %c2 : index %tile = amdaie.tile(%arg3, %21) %22 = amdaie.core(%tile, in : [%19, %20], out : []) { - linalg.fill ins(%c0_i32 : i32) outs(%subview : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) + linalg.fill ins(%c0_i32 : i32) outs(%subview : memref<1x1x8x8x4x4xi32, 2 : i32>) %base_buffer, %offset, %sizes:6, %strides:6 = memref.extract_strided_metadata %alloc_0 : memref<1x1x4x8x4x8xi32, 2 : i32> -> memref, index, index, index, index, index, index, index, index, index, index, index, index, index %base_buffer_5, %offset_6, %sizes_7:6, %strides_8:6 = memref.extract_strided_metadata %alloc : memref<1x1x8x4x8x4xi32, 2 : i32> -> memref, index, index, index, index, index, index, index, index, index, index, index, index, index - %base_buffer_9, %offset_10, %sizes_11:6, %strides_12:6 = memref.extract_strided_metadata %subview : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> -> memref, index, index, index, index, index, index, index, index, index, index, index, index, index + %base_buffer_9, %offset_10, %sizes_11:6, %strides_12:6 = memref.extract_strided_metadata %subview : memref<1x1x8x8x4x4xi32, 2 : i32> -> memref, index, index, index, index, index, index, index, index, index, index, index, index, index func.call @matmul_i32_i32(%base_buffer, %c0, %base_buffer_5, %c0, %base_buffer_9, %offset_10) : (memref, index, memref, index, memref, index) -> () amdaie.end } {elf_file = "/path/to/ukernel.o"} } {mapping = [#gpu.thread, #gpu.thread]} } {mapping 
= [#gpu.block, #gpu.block]} - memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xi32, 2 : i32> + memref.dealloc %alloc_3 : memref<1x1x8x8x4x4xi32, 2 : i32> memref.dealloc %alloc_2 : memref<2x1x32x32xi32, 1 : i32> memref.dealloc %alloc_1 : memref<1x2x32x32xi32, 1 : i32> memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xi32, 2 : i32> diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_l1_allocations.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_l1_allocations.mlir new file mode 100644 index 000000000..1d3c38d7a --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_l1_allocations.mlir @@ -0,0 +1,93 @@ +// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-distribute-l1-allocations)" --split-input-file --verify-diagnostics %s | FileCheck %s + +// ----- + +// CHECK-LABEL: @distribute_l1_memory_test_0 + +// The L1 allocation (memory space 2) becomes private to each thread: +// CHECK: %[[L1ALLOC:.+]] = memref.alloc() : memref<1x1x32x32xi32, 2> + +// The linalg.fill acts directly on the private allocation, not a view of the +// shared allocation: +// CHECK: linalg.fill +// CHECK-SAME: outs(%[[L1ALLOC]] : memref<1x1x32x32xi32, 2>) +// CHECK: linalg.fill +// CHECK-SAME: outs(%[[L1ALLOC]] : memref<1x1x32x32xi32, 2>) +// CHECK: memref.dealloc %[[L1ALLOC]] : memref<1x1x32x32xi32, 2> + +func.func @distribute_l1_memory_test_0() { + %c0_i32 = arith.constant 0 : i32 + %alloc = memref.alloc() : memref<2x2x32x32xi32, 2> + scf.forall (%arg2, %arg3) in (2, 2) { + %subview = memref.subview %alloc[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x2x32x32xi32, 2> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 2> + linalg.fill ins(%c0_i32 : i32) outs(%subview : memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 2>) + } {mapping = [#gpu.thread, #gpu.thread]} + scf.forall (%arg2, %arg3) in (2, 2) { + %subview = memref.subview %alloc[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [1,
1, 1, 1] : memref<2x2x32x32xi32, 2> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 2> + linalg.fill ins(%c0_i32 : i32) outs(%subview : memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 2>) + } {mapping = [#gpu.thread, #gpu.thread]} + memref.dealloc %alloc : memref<2x2x32x32xi32, 2> + return +} + +// ----- + +// CHECK-LABEL: @transfer_read_test() +// CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<1x8xbf16, 2> +// CHECK: vector.transfer_read %[[ALLOC]] +// CHECK-SAME: memref<1x8xbf16, 2>, vector<1x8xbf16> + +func.func @transfer_read_test(){ + %alloc = memref.alloc() : memref<4x8xbf16, 2> + scf.forall (%arg0) in (4) { + %c0 = arith.constant 0 : index + %c0_bf16 = arith.constant 0.000000e+00 : bf16 + %subview = memref.subview %alloc[%arg0, 0] [1, 8] [1, 1] : + memref<4x8xbf16, 2> to memref<1x8xbf16, strided<[8, 1], offset: ?>, 2> + %vector = vector.transfer_read %subview[%c0, %c0], %c0_bf16 {in_bounds = [true, true]} : + memref<1x8xbf16, strided<[8, 1], offset: ?>, 2>, vector<1x8xbf16> + } {mapping = [#gpu.thread]} + return +} + +// ----- + +// CHECK: @transfer_write_test(%[[VECTOR:.+]]: vector<1x8xbf16>) +// CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<1x8xbf16, 2> +// CHECK: scf.forall +// CHECK: vector.transfer_write %[[VECTOR]], %[[ALLOC]] +// CHECK-SAME: vector<1x8xbf16>, memref<1x8xbf16, 2> + +func.func @transfer_write_test(%vector : vector<1x8xbf16>){ + %alloc = memref.alloc() : memref<4x8xbf16, 2> + scf.forall (%arg0) in (4) { + %c0 = arith.constant 0 : index + %c0_bf16 = arith.constant 0.000000e+00 : bf16 + %subview = memref.subview %alloc[%arg0, 0] [1, 8] [1, 1] : + memref<4x8xbf16, 2> to memref<1x8xbf16, strided<[8, 1], offset: ?>, 2> + vector.transfer_write %vector, %subview[%c0, %c0] {} : vector<1x8xbf16>, memref<1x8xbf16, strided<[8, 1], offset: ?>, 2> + } {mapping = [#gpu.thread]} + return +} + +// ----- + +// Example where the subview cannot be determined to be distributing: + +// CHECK-LABEL: @non_distributing_subview 
+// CHECK-NOT: memref.alloc() : memref<1x4xbf16, 2> +// CHECK: return + +func.func @non_distributing_subview(%index : index) { + %alloc = memref.alloc() : memref<4x8xbf16, 2> + scf.forall (%arg0) in (4) { + %c0 = arith.constant 0 : index + %c0_bf16 = arith.constant 0.000000e+00 : bf16 + %subview = memref.subview %alloc[%arg0, %index] [1, 4] [1, 1] : + memref<4x8xbf16, 2> to memref<1x4xbf16, strided<[8, 1], offset: ?>, 2> + } {mapping = [#gpu.thread]} + return +} + + + From 725bdc91273967ddb44178b7f02ede593a3405bb Mon Sep 17 00:00:00 2001 From: James Newling Date: Fri, 11 Oct 2024 09:18:55 -0700 Subject: [PATCH 2/3] for loop {} --- .../iree-amd-aie/Transforms/AMDAIEDistributeL1Allocations.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeL1Allocations.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeL1Allocations.cpp index 63e797356..710d0ddfb 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeL1Allocations.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeL1Allocations.cpp @@ -36,9 +36,8 @@ FailureOr> getThreadIndVars(ModuleOp moduleOp) { if (mapping.empty()) return WalkResult::advance(); if (!isa(*mapping.begin())) return WalkResult::advance(); - for (Value indVar : forallOp.getInductionVars()) { + for (Value indVar : forallOp.getInductionVars()) threadIndVars.insert(indVar); - } return WalkResult::advance(); }); return threadIndVars; From ff1b9c107db5fc7eb5366b1fe5c6a4bfc9f74c82 Mon Sep 17 00:00:00 2001 From: James Newling Date: Fri, 11 Oct 2024 11:18:58 -0700 Subject: [PATCH 3/3] remove old line --- .../Transforms/test/distribute_cores_and_objectfifos.mlir | 1 - 1 file changed, 1 deletion(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_cores_and_objectfifos.mlir 
b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_cores_and_objectfifos.mlir index e36dcf26c..b00ccb6b0 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_cores_and_objectfifos.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_cores_and_objectfifos.mlir @@ -1,4 +1,3 @@ -// R UN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-distribute-l1-allocations,iree-amdaie-distribute-cores-and-objectfifos,cse)" --split-input-file --verify-diagnostics %s | FileCheck %s // RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-distribute-cores-and-objectfifos,cse)" --split-input-file --verify-diagnostics %s | FileCheck %s // Check for unrolling an amdaie.core within a parallel loop with a single