Skip to content

Commit

Permalink
Merge branch 'main' into lyq/update-torch-mlir
Browse files Browse the repository at this point in the history
  • Loading branch information
qingyunqu authored Aug 14, 2024
2 parents c2b350c + 3b1ad12 commit d1b448f
Show file tree
Hide file tree
Showing 11 changed files with 69 additions and 41 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/daily_ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,5 +39,5 @@ jobs:
- name: Checkout byteir repo
uses: actions/checkout@v3
- name: Build and test e2e
run: ./scripts/e2e/build_and_test_e2e.sh
run: ./tests/build_and_test_e2e.sh
shell: bash
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ class FuncOp;
} // namespace func

void populateRemoveCopyAfterBufferizationPattern(RewritePatternSet &patterns,
DominanceInfo &domInfo);
DominanceInfo &domInfo,
bool enableByreAlias);

std::unique_ptr<OperationPass<func::FuncOp>> createRemoveCopyPass();

Expand Down
34 changes: 27 additions & 7 deletions compiler/lib/Dialect/MemRef/Transforms/RemoveCopy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,11 @@
//===----------------------------------------------------------------------===//

#include "byteir/Dialect/MemRef/Transforms/RemoveCopy.h"
#include "byteir/Dialect/Byre/ByreDialect.h"
#include "byteir/Dialect/MemRef/Utils/MemEffect.h"
#include "byteir/Utils/Hoist.h"
#include "byteir/Utils/MemUtils.h"
#include "byteir/Utils/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
Expand Down Expand Up @@ -144,8 +146,10 @@ int64_t extractOffset(MemRefType memref) {

class RemoveCopyPattern : public OpRewritePattern<memref::CopyOp> {
public:
RemoveCopyPattern(MLIRContext *context, DominanceInfo &dom)
: OpRewritePattern(context), domInfo(dom) {}
RemoveCopyPattern(MLIRContext *context, DominanceInfo &dom,
bool enableByreAlias)
: OpRewritePattern(context), domInfo(dom),
enableByreAlias(enableByreAlias) {}

LogicalResult matchAndRewrite(memref::CopyOp copyOp,
PatternRewriter &rewriter) const override {
Expand Down Expand Up @@ -286,6 +290,8 @@ class RemoveCopyPattern : public OpRewritePattern<memref::CopyOp> {

auto sourceMemref = src.getType().cast<MemRefType>();
auto targetMemref = target.getType().cast<MemRefType>();
// The target is generated by memref.alloc(), so its layout must be identity.
assert(targetMemref.getLayout().isIdentity());
int64_t srcMemrefOffset = 0;
int64_t tgtMemrefOffset = 0;
SmallVector<int64_t> srcStrides;
Expand All @@ -310,8 +316,13 @@ class RemoveCopyPattern : public OpRewritePattern<memref::CopyOp> {
copyOp.getLoc(), targetMemref, src, tgtMemrefOffset,
targetMemref.getShape(), tgtStrides);
} else {
// TODO: use some op like memref.reinterpret_cast to handle offset
return failure();
if (this->enableByreAlias) {
// use byre.alias to decouple offset from memref type
srcCast = rewriter.create<byre::AliasOp>(
copyOp.getLoc(), targetMemref, src, srcMemrefOffset);
} else {
return failure();
}
}
} else {
srcCast = rewriter.create<memref::CastOp>(copyOp.getLoc(),
Expand Down Expand Up @@ -392,6 +403,7 @@ class RemoveCopyPattern : public OpRewritePattern<memref::CopyOp> {

private:
DominanceInfo &domInfo;
bool enableByreAlias;
};

struct RemoveCopyPass : public RemoveCopyBase<RemoveCopyPass> {
Expand All @@ -400,10 +412,17 @@ struct RemoveCopyPass : public RemoveCopyBase<RemoveCopyPass> {
void runOnOperation() override {

func::FuncOp funcOp = getOperation();
bool isByreEntryFunc =
funcOp->hasAttrOfType<UnitAttr>(
byre::ByreDialect::getEntryPointFunctionAttrName()) ||
funcOp->hasAttrOfType<UnitAttr>(getAttrPlaceholderName(
byre::ByreDialect::getEntryPointFunctionAttrName()));

auto &domInfo = getAnalysis<DominanceInfo>();
auto &ctx = getContext();
RewritePatternSet patterns(&ctx);
populateRemoveCopyAfterBufferizationPattern(patterns, domInfo);
populateRemoveCopyAfterBufferizationPattern(
patterns, domInfo, /*enableByreAlias=*/isByreEntryFunc);

// also insert related canonicalizer
memref::AllocOp::getCanonicalizationPatterns(patterns, &ctx);
Expand All @@ -429,8 +448,9 @@ struct RemoveCopyPass : public RemoveCopyBase<RemoveCopyPass> {
} // namespace

void mlir::populateRemoveCopyAfterBufferizationPattern(
RewritePatternSet &patterns, DominanceInfo &domInfo) {
patterns.add<RemoveCopyPattern>(patterns.getContext(), domInfo);
RewritePatternSet &patterns, DominanceInfo &domInfo, bool enableByreAlias) {
patterns.add<RemoveCopyPattern>(patterns.getContext(), domInfo,
enableByreAlias);
}

std::unique_ptr<OperationPass<func::FuncOp>> mlir::createRemoveCopyPass() {
Expand Down
20 changes: 20 additions & 0 deletions compiler/test/Dialect/MemRef/removeCopy.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -667,3 +667,23 @@ func.func @stride_copy(%arg0: memref<32x64xf32>) -> (memref<1x16x1xf32>) attrib

// CHECK-LABEL: func.func @stride_copy
// CHECK-NOT: memref.copy

// -----

// Test input for the byre.alias path of copy removal.
// The function carries the `__placeholder__byre.entry_point` attribute, and the
// source of the copy below has a static non-zero offset in its strided layout.
// The directives after this function expect the memref.copy to be gone and a
// byre.alias op to be present in the output.
func.func @byre_alias(%arg0: memref<512x200xf32>, %arg1: memref<512x200xf32>) -> (memref<256x256xf32>) attributes {__placeholder__byre.entry_point} {
// View of the first 128 rows of %arg0; the result layout has no offset.
%subview = memref.subview %arg0[0, 0] [128, 200] [1, 1] : memref<512x200xf32> to memref<128x200xf32, strided<[200, 1]>>
// View of %arg1 starting at row 10, so the result type carries
// offset 10 * 200 = 2000.
%subview_0 = memref.subview %arg1[10, 0] [128, 200] [1, 1] : memref<512x200xf32> to memref<128x200xf32, strided<[200, 1], offset: 2000>>
// Reshape the zero-offset view 128x200 -> 25600 -> 256x100 (first matmul operand).
%collapse_shape = memref.collapse_shape %subview [[0, 1]] : memref<128x200xf32, strided<[200, 1]>> into memref<25600xf32, strided<[1]>>
%expand_shape = memref.expand_shape %collapse_shape [[0, 1]] output_shape [256, 100] : memref<25600xf32, strided<[1]>> into memref<256x100xf32>
// Reshape the offset view 128x200 -> 25600 -> 100x256; the 2000 offset is kept
// in every intermediate type.
%collapse_shape_1 = memref.collapse_shape %subview_0 [[0, 1]] : memref<128x200xf32, strided<[200, 1], offset: 2000>> into memref<25600xf32, strided<[1], offset: 2000>>
%expand_shape_2 = memref.expand_shape %collapse_shape_1 [[0, 1]] output_shape [100, 256] : memref<25600xf32, strided<[1], offset: 2000>> into memref<100x256xf32, strided<[256, 1], offset: 2000>>
// Result buffer returned by the function.
%alloc = memref.alloc() : memref<256x256xf32>
// Fresh identity-layout buffer that receives the copy of the offset view.
%alloc_3 = memref.alloc() : memref<100x256xf32>
// The copy under test: source has offset 2000, destination has identity layout.
memref.copy %expand_shape_2, %alloc_3 : memref<100x256xf32, strided<[256, 1], offset: 2000>> to memref<100x256xf32>
// The copy's destination is passed to the matmul as its second operand.
byre.compute @MatmulOp_f32f32_f32(%expand_shape, %alloc_3, %alloc) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<256x100xf32>, memref<100x256xf32>, memref<256x256xf32>
return %alloc : memref<256x256xf32>
}

// CHECK-LABEL: func.func @byre_alias
// CHECK-NOT: memref.copy
// CHECK: byre.alias
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,9 @@ module {
%collapse_shape_1 = memref.collapse_shape %subview_0 [[0, 1]] : memref<128x200xf32, strided<[200, 1], offset: 2000>> into memref<25600xf32, strided<[1], offset: 2000>>
%expand_shape_2 = memref.expand_shape %collapse_shape_1 [[0, 1]] output_shape [100, 256] : memref<25600xf32, strided<[1], offset: 2000>> into memref<100x256xf32, strided<[256, 1], offset: 2000>>
%alloc = memref.alloc() : memref<256x256xf32>
%alloc_3 = memref.alloc() : memref<100x256xf32>
memref.copy %expand_shape_2, %alloc_3 : memref<100x256xf32, strided<[256, 1], offset: 2000>> to memref<100x256xf32>
byre.compute @MatmulOp_f32f32_f32(%expand_shape, %alloc_3, %alloc) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<256x100xf32>, memref<100x256xf32>, memref<256x256xf32>
%0 = call @Unknown0(%arg0, %arg1) : (memref<512x200xf32>, memref<512x200xf32>) -> memref<512x200xf32>
return %alloc, %0 : memref<256x256xf32>, memref<512x200xf32>
%0 = "byre.alias"(%expand_shape_2) <{offset = 2000 : i64}> : (memref<100x256xf32, strided<[256, 1], offset: 2000>>) -> memref<100x256xf32>
byre.compute @MatmulOp_f32f32_f32(%expand_shape, %0, %alloc) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<256x100xf32>, memref<100x256xf32>, memref<256x256xf32>
%1 = call @Unknown0(%arg0, %arg1) : (memref<512x200xf32>, memref<512x200xf32>) -> memref<512x200xf32>
return %alloc, %1 : memref<256x256xf32>, memref<512x200xf32>
}
}
9 changes: 4 additions & 5 deletions compiler/test/E2E/CUDA/AliasLikeGPU/6_gpu_opt.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,9 @@ module {
%collapse_shape_1 = memref.collapse_shape %subview_0 [[0, 1]] : memref<128x200xf32, strided<[200, 1], offset: 2000>> into memref<25600xf32, strided<[1], offset: 2000>>
%expand_shape_2 = memref.expand_shape %collapse_shape_1 [[0, 1]] output_shape [100, 256] : memref<25600xf32, strided<[1], offset: 2000>> into memref<100x256xf32, strided<[256, 1], offset: 2000>>
%alloc = memref.alloc() : memref<256x256xf32>
%alloc_3 = memref.alloc() : memref<100x256xf32>
memref.copy %expand_shape_2, %alloc_3 : memref<100x256xf32, strided<[256, 1], offset: 2000>> to memref<100x256xf32>
byre.compute @MatmulOp_f32f32_f32(%expand_shape, %alloc_3, %alloc) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<256x100xf32>, memref<100x256xf32>, memref<256x256xf32>
%0 = call @Unknown0(%arg0, %arg1) : (memref<512x200xf32>, memref<512x200xf32>) -> memref<512x200xf32>
return %alloc, %0 : memref<256x256xf32>, memref<512x200xf32>
%0 = "byre.alias"(%expand_shape_2) <{offset = 2000 : i64}> : (memref<100x256xf32, strided<[256, 1], offset: 2000>>) -> memref<100x256xf32>
byre.compute @MatmulOp_f32f32_f32(%expand_shape, %0, %alloc) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<256x100xf32>, memref<100x256xf32>, memref<256x256xf32>
%1 = call @Unknown0(%arg0, %arg1) : (memref<512x200xf32>, memref<512x200xf32>) -> memref<512x200xf32>
return %alloc, %1 : memref<256x256xf32>, memref<512x200xf32>
}
}
9 changes: 4 additions & 5 deletions compiler/test/E2E/CUDA/AliasLikeGPU/7_set_space_opt.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,9 @@ module attributes {gpu.container_module} {
%collapse_shape_1 = memref.collapse_shape %subview_0 [[0, 1]] : memref<128x200xf32, strided<[200, 1], offset: 2000>> into memref<25600xf32, strided<[1], offset: 2000>>
%expand_shape_2 = memref.expand_shape %collapse_shape_1 [[0, 1]] output_shape [100, 256] : memref<25600xf32, strided<[1], offset: 2000>> into memref<100x256xf32, strided<[256, 1], offset: 2000>>
%alloc = memref.alloc() : memref<256x256xf32>
%alloc_3 = memref.alloc() : memref<100x256xf32>
memref.copy %expand_shape_2, %alloc_3 : memref<100x256xf32, strided<[256, 1], offset: 2000>> to memref<100x256xf32>
byre.compute @MatmulOp_f32f32_f32(%expand_shape, %alloc_3, %alloc) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<256x100xf32>, memref<100x256xf32>, memref<256x256xf32>
%0 = call @Unknown0(%arg0, %arg1) : (memref<512x200xf32>, memref<512x200xf32>) -> memref<512x200xf32>
return %alloc, %0 : memref<256x256xf32>, memref<512x200xf32>
%0 = "byre.alias"(%expand_shape_2) <{offset = 2000 : i64}> : (memref<100x256xf32, strided<[256, 1], offset: 2000>>) -> memref<100x256xf32>
byre.compute @MatmulOp_f32f32_f32(%expand_shape, %0, %alloc) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<256x100xf32>, memref<100x256xf32>, memref<256x256xf32>
%1 = call @Unknown0(%arg0, %arg1) : (memref<512x200xf32>, memref<512x200xf32>) -> memref<512x200xf32>
return %alloc, %1 : memref<256x256xf32>, memref<512x200xf32>
}
}
9 changes: 4 additions & 5 deletions compiler/test/E2E/CUDA/AliasLikeGPU/8_byre_opt.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,9 @@ module attributes {gpu.container_module} {
%collapse_shape_1 = memref.collapse_shape %subview_0 [[0, 1]] : memref<128x200xf32, strided<[200, 1], offset: 2000>, "cuda"> into memref<25600xf32, strided<[1], offset: 2000>, "cuda">
%expand_shape_2 = memref.expand_shape %collapse_shape_1 [[0, 1]] output_shape [100, 256] : memref<25600xf32, strided<[1], offset: 2000>, "cuda"> into memref<100x256xf32, strided<[256, 1], offset: 2000>, "cuda">
%alloc = memref.alloc() : memref<256x256xf32, "cuda">
%alloc_3 = memref.alloc() : memref<100x256xf32, "cuda">
memref.copy %expand_shape_2, %alloc_3 : memref<100x256xf32, strided<[256, 1], offset: 2000>, "cuda"> to memref<100x256xf32, "cuda">
byre.compute @MatmulOp_f32f32_f32(%expand_shape, %alloc_3, %alloc) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<256x100xf32, "cuda">, memref<100x256xf32, "cuda">, memref<256x256xf32, "cuda">
%0 = call @Unknown0(%arg0, %arg1) : (memref<512x200xf32, "cuda">, memref<512x200xf32, "cuda">) -> memref<512x200xf32, "cuda">
return %alloc, %0 : memref<256x256xf32, "cuda">, memref<512x200xf32, "cuda">
%0 = "byre.alias"(%expand_shape_2) <{offset = 2000 : i64}> {device = "cuda"} : (memref<100x256xf32, strided<[256, 1], offset: 2000>, "cuda">) -> memref<100x256xf32, "cuda">
byre.compute @MatmulOp_f32f32_f32(%expand_shape, %0, %alloc) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<256x100xf32, "cuda">, memref<100x256xf32, "cuda">, memref<256x256xf32, "cuda">
%1 = call @Unknown0(%arg0, %arg1) : (memref<512x200xf32, "cuda">, memref<512x200xf32, "cuda">) -> memref<512x200xf32, "cuda">
return %alloc, %1 : memref<256x256xf32, "cuda">, memref<512x200xf32, "cuda">
}
}
5 changes: 1 addition & 4 deletions compiler/test/E2E/CUDA/AliasLikeGPU/9a_byre_host.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,8 @@ module attributes {byre.container_module, gpu.container_module} {
}
}
func.func @main(%arg0: memref<512x200xf32, "cuda"> {byre.argname = "Input0", byre.argtype = 1 : i32}, %arg1: memref<512x200xf32, "cuda"> {byre.argname = "Input1", byre.argtype = 1 : i32}, %arg2: memref<256x256xf32, "cuda"> {byre.argname = "Output0", byre.argtype = 2 : i32}, %arg3: memref<512x200xf32, "cuda"> {byre.argname = "Output1", byre.argtype = 2 : i32}) attributes {byre.entry_point} {
%alloc = memref.alloc() : memref<102400xi8, "cuda">
%0 = "byre.alias"(%arg0) <{offset = 0 : i64}> : (memref<512x200xf32, "cuda">) -> memref<256x100xf32, "cuda">
%1 = "byre.alias"(%alloc) <{offset = 0 : i64}> : (memref<102400xi8, "cuda">) -> memref<100x256xf32, "cuda">
%2 = "byre.alias"(%arg1) <{offset = 2000 : i64}> : (memref<512x200xf32, "cuda">) -> memref<100x256xf32, "cuda">
byre.copy(%2, %1) {callee = "cuda2cuda"} : memref<100x256xf32, "cuda">, memref<100x256xf32, "cuda">
%1 = "byre.alias"(%arg1) <{offset = 2000 : i64}> : (memref<512x200xf32, "cuda">) -> memref<100x256xf32, "cuda">
byre.compute @MatmulOp_f32f32_f32(%0, %1, %arg2) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<256x100xf32, "cuda">, memref<100x256xf32, "cuda">, memref<256x256xf32, "cuda">
byre.compute @PTXOp(%arg0, %arg1, %arg3) {BlockSize.x = 256 : i32, GridSize.x = 100 : i32, arg_ranks = [2 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown0", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512x200xf32, "cuda">, memref<512x200xf32, "cuda">, memref<512x200xf32, "cuda">
return
Expand Down
5 changes: 1 addition & 4 deletions compiler/test/E2E/CUDA/AliasLikeGPU/9b_nvvm_codegen.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,8 @@ module attributes {byre.container_module, gpu.container_module} {
}
}
func.func @main(%arg0: memref<512x200xf32, "cuda"> {byre.argname = "Input0", byre.argtype = 1 : i32}, %arg1: memref<512x200xf32, "cuda"> {byre.argname = "Input1", byre.argtype = 1 : i32}, %arg2: memref<256x256xf32, "cuda"> {byre.argname = "Output0", byre.argtype = 2 : i32}, %arg3: memref<512x200xf32, "cuda"> {byre.argname = "Output1", byre.argtype = 2 : i32}) attributes {byre.entry_point} {
%alloc = memref.alloc() : memref<102400xi8, "cuda">
%0 = "byre.alias"(%arg0) <{offset = 0 : i64}> : (memref<512x200xf32, "cuda">) -> memref<256x100xf32, "cuda">
%1 = "byre.alias"(%alloc) <{offset = 0 : i64}> : (memref<102400xi8, "cuda">) -> memref<100x256xf32, "cuda">
%2 = "byre.alias"(%arg1) <{offset = 2000 : i64}> : (memref<512x200xf32, "cuda">) -> memref<100x256xf32, "cuda">
byre.copy(%2, %1) {callee = "cuda2cuda"} : memref<100x256xf32, "cuda">, memref<100x256xf32, "cuda">
%1 = "byre.alias"(%arg1) <{offset = 2000 : i64}> : (memref<512x200xf32, "cuda">) -> memref<100x256xf32, "cuda">
byre.compute @MatmulOp_f32f32_f32(%0, %1, %arg2) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<256x100xf32, "cuda">, memref<100x256xf32, "cuda">, memref<256x256xf32, "cuda">
byre.compute @PTXOp(%arg0, %arg1, %arg3) {BlockSize.x = 256 : i32, GridSize.x = 100 : i32, arg_ranks = [2 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown0", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512x200xf32, "cuda">, memref<512x200xf32, "cuda">, memref<512x200xf32, "cuda">
return
Expand Down
5 changes: 1 addition & 4 deletions compiler/test/E2E/CUDA/AliasLikeGPU/host_output.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,8 @@

module attributes {byre.container_module, gpu.container_module} {
func.func @main(%arg0: memref<512x200xf32, "cuda"> {byre.argname = "Input0", byre.argtype = 1 : i32}, %arg1: memref<512x200xf32, "cuda"> {byre.argname = "Input1", byre.argtype = 1 : i32}, %arg2: memref<256x256xf32, "cuda"> {byre.argname = "Output0", byre.argtype = 2 : i32}, %arg3: memref<512x200xf32, "cuda"> {byre.argname = "Output1", byre.argtype = 2 : i32}) attributes {byre.entry_point, device_file_name = "your_file"} {
%alloc = memref.alloc() : memref<102400xi8, "cuda">
%0 = "byre.alias"(%arg0) <{offset = 0 : i64}> : (memref<512x200xf32, "cuda">) -> memref<256x100xf32, "cuda">
%1 = "byre.alias"(%alloc) <{offset = 0 : i64}> : (memref<102400xi8, "cuda">) -> memref<100x256xf32, "cuda">
%2 = "byre.alias"(%arg1) <{offset = 2000 : i64}> : (memref<512x200xf32, "cuda">) -> memref<100x256xf32, "cuda">
byre.copy(%2, %1) {callee = "cuda2cuda"} : memref<100x256xf32, "cuda">, memref<100x256xf32, "cuda">
%1 = "byre.alias"(%arg1) <{offset = 2000 : i64}> : (memref<512x200xf32, "cuda">) -> memref<100x256xf32, "cuda">
byre.compute @MatmulOp_f32f32_f32(%0, %1, %arg2) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<256x100xf32, "cuda">, memref<100x256xf32, "cuda">, memref<256x256xf32, "cuda">
byre.compute @PTXOp(%arg0, %arg1, %arg3) {BlockSize.x = 256 : i32, GridSize.x = 100 : i32, arg_ranks = [2 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown0", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512x200xf32, "cuda">, memref<512x200xf32, "cuda">, memref<512x200xf32, "cuda">
return
Expand Down

0 comments on commit d1b448f

Please sign in to comment.