Merge branch 'main' into lyq/update-torch-mlir

bytedance · Aug 14, 2024 · d1b448f · d1b448f
2 parents c2b350c + 3b1ad12
commit d1b448f
Show file tree

Hide file tree

Showing 11 changed files with 69 additions and 41 deletions.
diff --git a/.github/workflows/daily_ci.yaml b/.github/workflows/daily_ci.yaml
@@ -39,5 +39,5 @@ jobs:
       - name: Checkout byteir repo
         uses: actions/checkout@v3
       - name: Build and test e2e
-        run: ./scripts/e2e/build_and_test_e2e.sh
+        run: ./tests/build_and_test_e2e.sh
         shell: bash
diff --git a/compiler/include/byteir/Dialect/MemRef/Transforms/RemoveCopy.h b/compiler/include/byteir/Dialect/MemRef/Transforms/RemoveCopy.h
@@ -30,7 +30,8 @@ class FuncOp;
 } // namespace func
 
 void populateRemoveCopyAfterBufferizationPattern(RewritePatternSet &patterns,
-                                                 DominanceInfo &domInfo);
+                                                 DominanceInfo &domInfo,
+                                                 bool enableByreAlias);
 
 std::unique_ptr<OperationPass<func::FuncOp>> createRemoveCopyPass();
 

diff --git a/compiler/lib/Dialect/MemRef/Transforms/RemoveCopy.cpp b/compiler/lib/Dialect/MemRef/Transforms/RemoveCopy.cpp
@@ -16,9 +16,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "byteir/Dialect/MemRef/Transforms/RemoveCopy.h"
+#include "byteir/Dialect/Byre/ByreDialect.h"
 #include "byteir/Dialect/MemRef/Utils/MemEffect.h"
 #include "byteir/Utils/Hoist.h"
 #include "byteir/Utils/MemUtils.h"
+#include "byteir/Utils/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
@@ -144,8 +146,10 @@ int64_t extractOffset(MemRefType memref) {
 
 class RemoveCopyPattern : public OpRewritePattern<memref::CopyOp> {
 public:
-  RemoveCopyPattern(MLIRContext *context, DominanceInfo &dom)
-      : OpRewritePattern(context), domInfo(dom) {}
+  RemoveCopyPattern(MLIRContext *context, DominanceInfo &dom,
+                    bool enableByreAlias)
+      : OpRewritePattern(context), domInfo(dom),
+        enableByreAlias(enableByreAlias) {}
 
   LogicalResult matchAndRewrite(memref::CopyOp copyOp,
                                 PatternRewriter &rewriter) const override {
@@ -286,6 +290,8 @@ class RemoveCopyPattern : public OpRewritePattern<memref::CopyOp> {
 
         auto sourceMemref = src.getType().cast<MemRefType>();
         auto targetMemref = target.getType().cast<MemRefType>();
+        // target generated by memref.alloc(), it must be identity.
+        assert(targetMemref.getLayout().isIdentity());
         int64_t srcMemrefOffset = 0;
         int64_t tgtMemrefOffset = 0;
         SmallVector<int64_t> srcStrides;
@@ -310,8 +316,13 @@ class RemoveCopyPattern : public OpRewritePattern<memref::CopyOp> {
                 copyOp.getLoc(), targetMemref, src, tgtMemrefOffset,
                 targetMemref.getShape(), tgtStrides);
           } else {
-            // TODO: use some op like memref.reinterpret_cast to handle offset
-            return failure();
+            if (this->enableByreAlias) {
+              // use byre.alias to decouple offset from memref type
+              srcCast = rewriter.create<byre::AliasOp>(
+                  copyOp.getLoc(), targetMemref, src, srcMemrefOffset);
+            } else {
+              return failure();
+            }
           }
         } else {
           srcCast = rewriter.create<memref::CastOp>(copyOp.getLoc(),
@@ -392,6 +403,7 @@ class RemoveCopyPattern : public OpRewritePattern<memref::CopyOp> {
 
 private:
   DominanceInfo &domInfo;
+  bool enableByreAlias;
 };
 
 struct RemoveCopyPass : public RemoveCopyBase<RemoveCopyPass> {
@@ -400,10 +412,17 @@ struct RemoveCopyPass : public RemoveCopyBase<RemoveCopyPass> {
   void runOnOperation() override {
 
     func::FuncOp funcOp = getOperation();
+    bool isByreEntryFunc =
+        funcOp->hasAttrOfType<UnitAttr>(
+            byre::ByreDialect::getEntryPointFunctionAttrName()) ||
+        funcOp->hasAttrOfType<UnitAttr>(getAttrPlaceholderName(
+            byre::ByreDialect::getEntryPointFunctionAttrName()));
+
     auto &domInfo = getAnalysis<DominanceInfo>();
     auto &ctx = getContext();
     RewritePatternSet patterns(&ctx);
-    populateRemoveCopyAfterBufferizationPattern(patterns, domInfo);
+    populateRemoveCopyAfterBufferizationPattern(
+        patterns, domInfo, /*enableByreAlias=*/isByreEntryFunc);
 
     // also insert related canonicalizer
     memref::AllocOp::getCanonicalizationPatterns(patterns, &ctx);
@@ -429,8 +448,9 @@ struct RemoveCopyPass : public RemoveCopyBase<RemoveCopyPass> {
 } // namespace
 
 void mlir::populateRemoveCopyAfterBufferizationPattern(
-    RewritePatternSet &patterns, DominanceInfo &domInfo) {
-  patterns.add<RemoveCopyPattern>(patterns.getContext(), domInfo);
+    RewritePatternSet &patterns, DominanceInfo &domInfo, bool enableByreAlias) {
+  patterns.add<RemoveCopyPattern>(patterns.getContext(), domInfo,
+                                  enableByreAlias);
 }
 
 std::unique_ptr<OperationPass<func::FuncOp>> mlir::createRemoveCopyPass() {

diff --git a/compiler/test/Dialect/MemRef/removeCopy.mlir b/compiler/test/Dialect/MemRef/removeCopy.mlir
@@ -667,3 +667,23 @@ func.func @stride_copy(%arg0: memref<32x64xf32>)  -> (memref<1x16x1xf32>) attrib
 
 // CHECK-LABEL: func.func @stride_copy
 // CHECK-NOT: memref.copy
+
+// -----
+
+func.func @byre_alias(%arg0: memref<512x200xf32>, %arg1: memref<512x200xf32>) -> (memref<256x256xf32>) attributes {__placeholder__byre.entry_point} {
+  %subview = memref.subview %arg0[0, 0] [128, 200] [1, 1] : memref<512x200xf32> to memref<128x200xf32, strided<[200, 1]>>
+  %subview_0 = memref.subview %arg1[10, 0] [128, 200] [1, 1] : memref<512x200xf32> to memref<128x200xf32, strided<[200, 1], offset: 2000>>
+  %collapse_shape = memref.collapse_shape %subview [[0, 1]] : memref<128x200xf32, strided<[200, 1]>> into memref<25600xf32, strided<[1]>>
+  %expand_shape = memref.expand_shape %collapse_shape [[0, 1]] output_shape [256, 100] : memref<25600xf32, strided<[1]>> into memref<256x100xf32>
+  %collapse_shape_1 = memref.collapse_shape %subview_0 [[0, 1]] : memref<128x200xf32, strided<[200, 1], offset: 2000>> into memref<25600xf32, strided<[1], offset: 2000>>
+  %expand_shape_2 = memref.expand_shape %collapse_shape_1 [[0, 1]] output_shape [100, 256] : memref<25600xf32, strided<[1], offset: 2000>> into memref<100x256xf32, strided<[256, 1], offset: 2000>>
+  %alloc = memref.alloc() : memref<256x256xf32>
+  %alloc_3 = memref.alloc() : memref<100x256xf32>
+  memref.copy %expand_shape_2, %alloc_3 : memref<100x256xf32, strided<[256, 1], offset: 2000>> to memref<100x256xf32>
+  byre.compute @MatmulOp_f32f32_f32(%expand_shape, %alloc_3, %alloc) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<256x100xf32>, memref<100x256xf32>, memref<256x256xf32>
+  return %alloc : memref<256x256xf32>
+}
+
+// CHECK-LABEL: func.func @byre_alias
+// CHECK-NOT:  memref.copy
+// CHECK:      byre.alias
diff --git a/compiler/test/E2E/CUDA/AliasLikeGPU/5_alternative_scf_opt.mlir b/compiler/test/E2E/CUDA/AliasLikeGPU/5_alternative_scf_opt.mlir
@@ -32,10 +32,9 @@ module {
     %collapse_shape_1 = memref.collapse_shape %subview_0 [[0, 1]] : memref<128x200xf32, strided<[200, 1], offset: 2000>> into memref<25600xf32, strided<[1], offset: 2000>>
     %expand_shape_2 = memref.expand_shape %collapse_shape_1 [[0, 1]] output_shape [100, 256] : memref<25600xf32, strided<[1], offset: 2000>> into memref<100x256xf32, strided<[256, 1], offset: 2000>>
     %alloc = memref.alloc() : memref<256x256xf32>
-    %alloc_3 = memref.alloc() : memref<100x256xf32>
-    memref.copy %expand_shape_2, %alloc_3 : memref<100x256xf32, strided<[256, 1], offset: 2000>> to memref<100x256xf32>
-    byre.compute @MatmulOp_f32f32_f32(%expand_shape, %alloc_3, %alloc) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<256x100xf32>, memref<100x256xf32>, memref<256x256xf32>
-    %0 = call @Unknown0(%arg0, %arg1) : (memref<512x200xf32>, memref<512x200xf32>) -> memref<512x200xf32>
-    return %alloc, %0 : memref<256x256xf32>, memref<512x200xf32>
+    %0 = "byre.alias"(%expand_shape_2) <{offset = 2000 : i64}> : (memref<100x256xf32, strided<[256, 1], offset: 2000>>) -> memref<100x256xf32>
+    byre.compute @MatmulOp_f32f32_f32(%expand_shape, %0, %alloc) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<256x100xf32>, memref<100x256xf32>, memref<256x256xf32>
+    %1 = call @Unknown0(%arg0, %arg1) : (memref<512x200xf32>, memref<512x200xf32>) -> memref<512x200xf32>
+    return %alloc, %1 : memref<256x256xf32>, memref<512x200xf32>
   }
 }
diff --git a/compiler/test/E2E/CUDA/AliasLikeGPU/6_gpu_opt.mlir b/compiler/test/E2E/CUDA/AliasLikeGPU/6_gpu_opt.mlir
@@ -27,10 +27,9 @@ module {
     %collapse_shape_1 = memref.collapse_shape %subview_0 [[0, 1]] : memref<128x200xf32, strided<[200, 1], offset: 2000>> into memref<25600xf32, strided<[1], offset: 2000>>
     %expand_shape_2 = memref.expand_shape %collapse_shape_1 [[0, 1]] output_shape [100, 256] : memref<25600xf32, strided<[1], offset: 2000>> into memref<100x256xf32, strided<[256, 1], offset: 2000>>
     %alloc = memref.alloc() : memref<256x256xf32>
-    %alloc_3 = memref.alloc() : memref<100x256xf32>
-    memref.copy %expand_shape_2, %alloc_3 : memref<100x256xf32, strided<[256, 1], offset: 2000>> to memref<100x256xf32>
-    byre.compute @MatmulOp_f32f32_f32(%expand_shape, %alloc_3, %alloc) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<256x100xf32>, memref<100x256xf32>, memref<256x256xf32>
-    %0 = call @Unknown0(%arg0, %arg1) : (memref<512x200xf32>, memref<512x200xf32>) -> memref<512x200xf32>
-    return %alloc, %0 : memref<256x256xf32>, memref<512x200xf32>
+    %0 = "byre.alias"(%expand_shape_2) <{offset = 2000 : i64}> : (memref<100x256xf32, strided<[256, 1], offset: 2000>>) -> memref<100x256xf32>
+    byre.compute @MatmulOp_f32f32_f32(%expand_shape, %0, %alloc) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<256x100xf32>, memref<100x256xf32>, memref<256x256xf32>
+    %1 = call @Unknown0(%arg0, %arg1) : (memref<512x200xf32>, memref<512x200xf32>) -> memref<512x200xf32>
+    return %alloc, %1 : memref<256x256xf32>, memref<512x200xf32>
   }
 }
diff --git a/compiler/test/E2E/CUDA/AliasLikeGPU/7_set_space_opt.mlir b/compiler/test/E2E/CUDA/AliasLikeGPU/7_set_space_opt.mlir
@@ -41,10 +41,9 @@ module attributes {gpu.container_module} {
     %collapse_shape_1 = memref.collapse_shape %subview_0 [[0, 1]] : memref<128x200xf32, strided<[200, 1], offset: 2000>> into memref<25600xf32, strided<[1], offset: 2000>>
     %expand_shape_2 = memref.expand_shape %collapse_shape_1 [[0, 1]] output_shape [100, 256] : memref<25600xf32, strided<[1], offset: 2000>> into memref<100x256xf32, strided<[256, 1], offset: 2000>>
     %alloc = memref.alloc() : memref<256x256xf32>
-    %alloc_3 = memref.alloc() : memref<100x256xf32>
-    memref.copy %expand_shape_2, %alloc_3 : memref<100x256xf32, strided<[256, 1], offset: 2000>> to memref<100x256xf32>
-    byre.compute @MatmulOp_f32f32_f32(%expand_shape, %alloc_3, %alloc) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<256x100xf32>, memref<100x256xf32>, memref<256x256xf32>
-    %0 = call @Unknown0(%arg0, %arg1) : (memref<512x200xf32>, memref<512x200xf32>) -> memref<512x200xf32>
-    return %alloc, %0 : memref<256x256xf32>, memref<512x200xf32>
+    %0 = "byre.alias"(%expand_shape_2) <{offset = 2000 : i64}> : (memref<100x256xf32, strided<[256, 1], offset: 2000>>) -> memref<100x256xf32>
+    byre.compute @MatmulOp_f32f32_f32(%expand_shape, %0, %alloc) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<256x100xf32>, memref<100x256xf32>, memref<256x256xf32>
+    %1 = call @Unknown0(%arg0, %arg1) : (memref<512x200xf32>, memref<512x200xf32>) -> memref<512x200xf32>
+    return %alloc, %1 : memref<256x256xf32>, memref<512x200xf32>
   }
 }
diff --git a/compiler/test/E2E/CUDA/AliasLikeGPU/8_byre_opt.mlir b/compiler/test/E2E/CUDA/AliasLikeGPU/8_byre_opt.mlir
@@ -34,10 +34,9 @@ module attributes {gpu.container_module} {
     %collapse_shape_1 = memref.collapse_shape %subview_0 [[0, 1]] : memref<128x200xf32, strided<[200, 1], offset: 2000>, "cuda"> into memref<25600xf32, strided<[1], offset: 2000>, "cuda">
     %expand_shape_2 = memref.expand_shape %collapse_shape_1 [[0, 1]] output_shape [100, 256] : memref<25600xf32, strided<[1], offset: 2000>, "cuda"> into memref<100x256xf32, strided<[256, 1], offset: 2000>, "cuda">
     %alloc = memref.alloc() : memref<256x256xf32, "cuda">
-    %alloc_3 = memref.alloc() : memref<100x256xf32, "cuda">
-    memref.copy %expand_shape_2, %alloc_3 : memref<100x256xf32, strided<[256, 1], offset: 2000>, "cuda"> to memref<100x256xf32, "cuda">
-    byre.compute @MatmulOp_f32f32_f32(%expand_shape, %alloc_3, %alloc) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<256x100xf32, "cuda">, memref<100x256xf32, "cuda">, memref<256x256xf32, "cuda">
-    %0 = call @Unknown0(%arg0, %arg1) : (memref<512x200xf32, "cuda">, memref<512x200xf32, "cuda">) -> memref<512x200xf32, "cuda">
-    return %alloc, %0 : memref<256x256xf32, "cuda">, memref<512x200xf32, "cuda">
+    %0 = "byre.alias"(%expand_shape_2) <{offset = 2000 : i64}> {device = "cuda"} : (memref<100x256xf32, strided<[256, 1], offset: 2000>, "cuda">) -> memref<100x256xf32, "cuda">
+    byre.compute @MatmulOp_f32f32_f32(%expand_shape, %0, %alloc) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<256x100xf32, "cuda">, memref<100x256xf32, "cuda">, memref<256x256xf32, "cuda">
+    %1 = call @Unknown0(%arg0, %arg1) : (memref<512x200xf32, "cuda">, memref<512x200xf32, "cuda">) -> memref<512x200xf32, "cuda">
+    return %alloc, %1 : memref<256x256xf32, "cuda">, memref<512x200xf32, "cuda">
   }
 }
diff --git a/compiler/test/E2E/CUDA/AliasLikeGPU/9a_byre_host.mlir b/compiler/test/E2E/CUDA/AliasLikeGPU/9a_byre_host.mlir
@@ -26,11 +26,8 @@ module attributes {byre.container_module, gpu.container_module} {
     }
   }
   func.func @main(%arg0: memref<512x200xf32, "cuda"> {byre.argname = "Input0", byre.argtype = 1 : i32}, %arg1: memref<512x200xf32, "cuda"> {byre.argname = "Input1", byre.argtype = 1 : i32}, %arg2: memref<256x256xf32, "cuda"> {byre.argname = "Output0", byre.argtype = 2 : i32}, %arg3: memref<512x200xf32, "cuda"> {byre.argname = "Output1", byre.argtype = 2 : i32}) attributes {byre.entry_point} {
-    %alloc = memref.alloc() : memref<102400xi8, "cuda">
     %0 = "byre.alias"(%arg0) <{offset = 0 : i64}> : (memref<512x200xf32, "cuda">) -> memref<256x100xf32, "cuda">
-    %1 = "byre.alias"(%alloc) <{offset = 0 : i64}> : (memref<102400xi8, "cuda">) -> memref<100x256xf32, "cuda">
-    %2 = "byre.alias"(%arg1) <{offset = 2000 : i64}> : (memref<512x200xf32, "cuda">) -> memref<100x256xf32, "cuda">
-    byre.copy(%2, %1) {callee = "cuda2cuda"} : memref<100x256xf32, "cuda">, memref<100x256xf32, "cuda">
+    %1 = "byre.alias"(%arg1) <{offset = 2000 : i64}> : (memref<512x200xf32, "cuda">) -> memref<100x256xf32, "cuda">
     byre.compute @MatmulOp_f32f32_f32(%0, %1, %arg2) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<256x100xf32, "cuda">, memref<100x256xf32, "cuda">, memref<256x256xf32, "cuda">
     byre.compute @PTXOp(%arg0, %arg1, %arg3) {BlockSize.x = 256 : i32, GridSize.x = 100 : i32, arg_ranks = [2 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown0", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512x200xf32, "cuda">, memref<512x200xf32, "cuda">, memref<512x200xf32, "cuda">
     return

diff --git a/compiler/test/E2E/CUDA/AliasLikeGPU/9b_nvvm_codegen.mlir b/compiler/test/E2E/CUDA/AliasLikeGPU/9b_nvvm_codegen.mlir
@@ -26,11 +26,8 @@ module attributes {byre.container_module, gpu.container_module} {
     }
   }
   func.func @main(%arg0: memref<512x200xf32, "cuda"> {byre.argname = "Input0", byre.argtype = 1 : i32}, %arg1: memref<512x200xf32, "cuda"> {byre.argname = "Input1", byre.argtype = 1 : i32}, %arg2: memref<256x256xf32, "cuda"> {byre.argname = "Output0", byre.argtype = 2 : i32}, %arg3: memref<512x200xf32, "cuda"> {byre.argname = "Output1", byre.argtype = 2 : i32}) attributes {byre.entry_point} {
-    %alloc = memref.alloc() : memref<102400xi8, "cuda">
     %0 = "byre.alias"(%arg0) <{offset = 0 : i64}> : (memref<512x200xf32, "cuda">) -> memref<256x100xf32, "cuda">
-    %1 = "byre.alias"(%alloc) <{offset = 0 : i64}> : (memref<102400xi8, "cuda">) -> memref<100x256xf32, "cuda">
-    %2 = "byre.alias"(%arg1) <{offset = 2000 : i64}> : (memref<512x200xf32, "cuda">) -> memref<100x256xf32, "cuda">
-    byre.copy(%2, %1) {callee = "cuda2cuda"} : memref<100x256xf32, "cuda">, memref<100x256xf32, "cuda">
+    %1 = "byre.alias"(%arg1) <{offset = 2000 : i64}> : (memref<512x200xf32, "cuda">) -> memref<100x256xf32, "cuda">
     byre.compute @MatmulOp_f32f32_f32(%0, %1, %arg2) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<256x100xf32, "cuda">, memref<100x256xf32, "cuda">, memref<256x256xf32, "cuda">
     byre.compute @PTXOp(%arg0, %arg1, %arg3) {BlockSize.x = 256 : i32, GridSize.x = 100 : i32, arg_ranks = [2 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown0", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512x200xf32, "cuda">, memref<512x200xf32, "cuda">, memref<512x200xf32, "cuda">
     return

diff --git a/compiler/test/E2E/CUDA/AliasLikeGPU/host_output.mlir b/compiler/test/E2E/CUDA/AliasLikeGPU/host_output.mlir
@@ -4,11 +4,8 @@
 
 module attributes {byre.container_module, gpu.container_module} {
   func.func @main(%arg0: memref<512x200xf32, "cuda"> {byre.argname = "Input0", byre.argtype = 1 : i32}, %arg1: memref<512x200xf32, "cuda"> {byre.argname = "Input1", byre.argtype = 1 : i32}, %arg2: memref<256x256xf32, "cuda"> {byre.argname = "Output0", byre.argtype = 2 : i32}, %arg3: memref<512x200xf32, "cuda"> {byre.argname = "Output1", byre.argtype = 2 : i32}) attributes {byre.entry_point, device_file_name = "your_file"} {
-    %alloc = memref.alloc() : memref<102400xi8, "cuda">
     %0 = "byre.alias"(%arg0) <{offset = 0 : i64}> : (memref<512x200xf32, "cuda">) -> memref<256x100xf32, "cuda">
-    %1 = "byre.alias"(%alloc) <{offset = 0 : i64}> : (memref<102400xi8, "cuda">) -> memref<100x256xf32, "cuda">
-    %2 = "byre.alias"(%arg1) <{offset = 2000 : i64}> : (memref<512x200xf32, "cuda">) -> memref<100x256xf32, "cuda">
-    byre.copy(%2, %1) {callee = "cuda2cuda"} : memref<100x256xf32, "cuda">, memref<100x256xf32, "cuda">
+    %1 = "byre.alias"(%arg1) <{offset = 2000 : i64}> : (memref<512x200xf32, "cuda">) -> memref<100x256xf32, "cuda">
     byre.compute @MatmulOp_f32f32_f32(%0, %1, %arg2) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<256x100xf32, "cuda">, memref<100x256xf32, "cuda">, memref<256x256xf32, "cuda">
     byre.compute @PTXOp(%arg0, %arg1, %arg3) {BlockSize.x = 256 : i32, GridSize.x = 100 : i32, arg_ranks = [2 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown0", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512x200xf32, "cuda">, memref<512x200xf32, "cuda">, memref<512x200xf32, "cuda">
     return