From 0504f7ad3662ce9a2044ea38f3a7f3df9b6708fc Mon Sep 17 00:00:00 2001 From: pvasireddy-amd Date: Tue, 11 Jun 2024 13:18:48 -0600 Subject: [PATCH] Changes to NpuDmaMemcpyNdOp and AIEDmaToNpu to support sub-word strides, offsets and sizes (#1538) Co-authored-by: Joseph Melber --- include/aie-c/TargetModel.h | 4 ++ include/aie/Dialect/AIE/IR/AIETargetModel.h | 9 +++ lib/CAPI/TargetModel.cpp | 4 ++ lib/Dialect/AIEX/IR/AIEXDialect.cpp | 12 +++- lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp | 16 ++++++ .../matrix_vector/aie2.py | 36 +++++------- .../matrix_multiplication/single_core/aie2.py | 56 +++++++------------ .../basic/passthrough_kernel/aie2.py | 11 ++-- programming_examples/basic/vector_exp/aie2.py | 15 +---- .../basic/vector_scalar_mul/aie2.py | 12 ++-- programming_examples/ml/bottleneck/aie2.py | 20 +++---- programming_examples/ml/conv2d/aie2.py | 14 ++--- .../ml/conv2d_fused_relu/aie2.py | 14 ++--- programming_examples/ml/eltwise_add/aie2.py | 18 ++---- programming_examples/ml/eltwise_mul/aie2.py | 18 ++---- programming_examples/ml/relu/aie2.py | 13 +---- .../ml/resnet/layers_conv2_x/aie2.py | 42 +++++++------- programming_examples/ml/softmax/aie2.py | 13 +---- .../vision/color_detect/aie2_colorDetect.py | 11 ++-- .../color_threshold/aie2_colorThreshold.py | 9 ++- .../vision/edge_detect/aie2_edgeDetect.py | 8 +-- .../vision/vision_passthrough/aie2.py | 8 +-- test/Conversion/DmaToNpu/bad_dma_to_npu.mlir | 29 ++++++++++ .../DmaToNpu/bad_dma_to_npu_datatype.mlir | 29 ++++++++++ .../DmaToNpu/dma_to_npu_width_conversion.mlir | 40 +++++++++++++ test/dialect/AIEX/bad_npu_nd.mlir | 14 ----- 26 files changed, 254 insertions(+), 221 deletions(-) create mode 100644 test/Conversion/DmaToNpu/bad_dma_to_npu.mlir create mode 100644 test/Conversion/DmaToNpu/bad_dma_to_npu_datatype.mlir create mode 100644 test/Conversion/DmaToNpu/dma_to_npu_width_conversion.mlir diff --git a/include/aie-c/TargetModel.h b/include/aie-c/TargetModel.h index c2c026fd37..59da2f2bf9 100644 --- 
a/include/aie-c/TargetModel.h +++ b/include/aie-c/TargetModel.h @@ -42,6 +42,10 @@ DEFINE_C_API_STRUCT(AieTargetModel, uint64_t); MLIR_CAPI_EXPORTED AieTargetModel aieGetTargetModel(uint32_t device); +/// Returns the target model's data bus width (address granularity) in bits. +MLIR_CAPI_EXPORTED uint32_t +aieGetTargetModelAddressGenGranularity(AieTargetModel targetModel); + /// Returns the number of columns in the target model. MLIR_CAPI_EXPORTED int aieTargetModelColumns(AieTargetModel targetModel); diff --git a/include/aie/Dialect/AIE/IR/AIETargetModel.h b/include/aie/Dialect/AIE/IR/AIETargetModel.h index e9f5de7680..a6ec03d230 100644 --- a/include/aie/Dialect/AIE/IR/AIETargetModel.h +++ b/include/aie/Dialect/AIE/IR/AIETargetModel.h @@ -61,6 +61,9 @@ class AIETargetModel { /// Return the target architecture. virtual AIEArch getTargetArch() const = 0; + /// Return the device's data bus width (address granularity) in bits. + virtual uint32_t getAddressGenGranularity() const = 0; + /// Return the number of columns in the device. virtual int columns() const = 0; @@ -293,6 +296,8 @@ class AIE2TargetModel : public AIETargetModel { AIEArch getTargetArch() const override; + uint32_t getAddressGenGranularity() const override { return 32; } + std::optional getMemWest(TileID src) const override; std::optional getMemEast(TileID src) const override; std::optional getMemNorth(TileID src) const override; @@ -352,6 +357,8 @@ class VC1902TargetModel : public AIE1TargetModel { public: VC1902TargetModel() = default; + uint32_t getAddressGenGranularity() const override { return 32; } + int columns() const override { return 50; } int rows() const override { return 9; /* One Shim row and 8 Core rows. 
*/ } @@ -532,6 +539,8 @@ class VirtualizedNPUTargetModel : public BaseNPUTargetModel { public: VirtualizedNPUTargetModel(int _cols) : cols(_cols) {} + uint32_t getAddressGenGranularity() const override { return 32; } + int columns() const override { return cols; } bool isShimNOCTile(int col, int row) const override { return row == 0; } diff --git a/lib/CAPI/TargetModel.cpp b/lib/CAPI/TargetModel.cpp index 9c41828871..bd5b33bd6e 100644 --- a/lib/CAPI/TargetModel.cpp +++ b/lib/CAPI/TargetModel.cpp @@ -28,6 +28,10 @@ AieTargetModel aieGetTargetModel(uint32_t device) { xilinx::AIE::getTargetModel(static_cast(device))); } +uint32_t aieGetTargetModelAddressGenGranularity(AieTargetModel targetModel) { + return unwrap(targetModel).getAddressGenGranularity(); +} + int aieTargetModelColumns(AieTargetModel targetModel) { return unwrap(targetModel).columns(); } diff --git a/lib/Dialect/AIEX/IR/AIEXDialect.cpp b/lib/Dialect/AIEX/IR/AIEXDialect.cpp index 57d5a13c2e..e1102b4fe3 100644 --- a/lib/Dialect/AIEX/IR/AIEXDialect.cpp +++ b/lib/Dialect/AIEX/IR/AIEXDialect.cpp @@ -66,8 +66,16 @@ LogicalResult AIEX::BroadcastPacketOp::verify() { LogicalResult AIEX::NpuDmaMemcpyNdOp::verify() { MemRefType buffer = getMemref().getType(); - if (buffer.getElementTypeBitWidth() != 32) - return emitOpError("must be used with memref type with element width 32."); + const auto &targetModel = AIE::getTargetModel(*this); + auto addressGranularity = targetModel.getAddressGenGranularity(); + if (buffer.getElementTypeBitWidth() > addressGranularity) { + return emitOpError("Maximum element bit width allowed is ") + << addressGranularity << "bits. "; + } else if ((buffer.getNumElements() * buffer.getElementTypeBitWidth()) < + addressGranularity) { + return emitOpError("Minimum data transfer size required is ") + << addressGranularity << "bits. 
"; + } if (!llvm::all_of(getMixedStrides(), [](OpFoldResult s) { return getConstantIntValue(s).has_value(); })) diff --git a/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp index e5b1332916..95f090776c 100644 --- a/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp +++ b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp @@ -230,6 +230,22 @@ struct DmaToNpuPattern : OpConversionPattern { llvm::reverse(op.getMixedOffsets()), [](OpFoldResult s) { return getConstantIntValue(s).value(); }); + MemRefType buffer = op.getMemref().getType(); + const auto &targetModel = AIE::getTargetModel(op); + auto elemWidth = buffer.getElementTypeBitWidth(); + auto addressGranularity = targetModel.getAddressGenGranularity(); + if (elemWidth < addressGranularity) { + if (!strides.empty()) { + for (int i = 0; i < 3; i++) { + strides[i] = (strides[i] * elemWidth) / addressGranularity; + } + } + if (!sizes.empty()) + sizes[0] = (sizes[0] * elemWidth) / addressGranularity; + if (!offsets.empty()) + offsets[0] = (offsets[0] * elemWidth) / addressGranularity; + } + // column column = IntegerAttr::get(i32ty, col); diff --git a/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py b/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py index 6b27d9f9e3..54276121c8 100644 --- a/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py +++ b/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py @@ -17,26 +17,20 @@ def my_matmul(): K = 288 m = 32 k = 32 - word_size_in = 2 - word_size_out = 4 n_cores = 1 - A_sz_in_i32s = M * K * word_size_in // 4 - B_sz_in_i32s = K * word_size_in // 4 - C_sz_in_bytes = M * word_size_out - C_sz_in_i32s = C_sz_in_bytes // 4 - C_sz_div_n_cores_in_i32s = C_sz_in_i32s // n_cores + A_sz = M * K + B_sz = K + C_sz = M + C_sz_div_n_cores = C_sz // n_cores M_div_m = M // m M_div_m_div_n_cores = M // (m * n_cores) K_div_k = K // k - K_in_i32s = K * word_size_in // 4 - k_in_i32s = k * 
word_size_in // 4 - m_in_i32s = m * word_size_in // 4 - m_x_k_in_i32s = m * k * word_size_in // 4 - m_x_K_in_i32s = m * K * word_size_in // 4 + m_x_k = m * k + m_x_K = m * K vectorized = True @@ -172,35 +166,35 @@ def core_body(): # To/from AIE-array data movement @FuncOp.from_py_func( - T.memref(A_sz_in_i32s, T.i32()), - T.memref(B_sz_in_i32s, T.i32()), - T.memref(C_sz_in_i32s, T.i32()), + T.memref(A_sz, T.bf16()), + T.memref(B_sz, T.bf16()), + T.memref(C_sz, T.f32()), ) def sequence(A, B, C): npu_dma_memcpy_nd( metadata=inB_fifo_names[0], bd_id=2, mem=B, - sizes=[M_div_m_div_n_cores, 1, 1, K_in_i32s], + sizes=[M_div_m_div_n_cores, 1, 1, K], strides=[0, 0, 0], ) for i in range(n_cores): - A_offset = i * M_div_m_div_n_cores * m * K * word_size_in // 4 - C_offset = i * M_div_m_div_n_cores * m * word_size_out // 4 + A_offset = i * M_div_m_div_n_cores * m * K + C_offset = i * M_div_m_div_n_cores * m npu_dma_memcpy_nd( metadata=memA_fifo_names[i], bd_id=1, mem=A, offsets=[0, 0, 0, A_offset], - sizes=[M_div_m_div_n_cores, K_div_k, m, k_in_i32s], - strides=[m_x_K_in_i32s, k_in_i32s, K_in_i32s], + sizes=[M_div_m_div_n_cores, K_div_k, m, k], + strides=[m_x_K, k, K], ) npu_dma_memcpy_nd( metadata=outC_fifo_names[i], bd_id=0, mem=C, offsets=[0, 0, 0, C_offset], - sizes=[1, 1, 1, C_sz_div_n_cores_in_i32s], + sizes=[1, 1, 1, C_sz_div_n_cores], strides=[0, 0, 0], ) diff --git a/programming_examples/basic/matrix_multiplication/single_core/aie2.py b/programming_examples/basic/matrix_multiplication/single_core/aie2.py index 909fba0c43..ba312aa417 100644 --- a/programming_examples/basic/matrix_multiplication/single_core/aie2.py +++ b/programming_examples/basic/matrix_multiplication/single_core/aie2.py @@ -22,36 +22,26 @@ def my_matmul(): r = 4 s = 8 t = 4 - word_size_in = 2 - word_size_out = 2 vectorized = True enable_tracing = False trace_size = 65536 - A_sz_in_i32s = M * K * word_size_in // 4 - B_sz_in_i32s = K * N * word_size_in // 4 - C_sz_in_bytes = M * N * word_size_out - 
C_sz_in_i32s = C_sz_in_bytes // 4 + A_sz = M * K + B_sz = K * N + C_sz = M * N + C_sz_in_bytes = C_sz * 2 M_div_m = M // m K_div_k = K // k N_div_n = N // n tiles = M_div_m * N_div_n - # Matrix A: MxK, submatrices a: mxk - k_in_i32s = k * word_size_in // 4 - K_in_i32s = K * word_size_in // 4 - # Matrix B: KxN, submatrices b: kxn - n_in_i32s = n * word_size_in // 4 - N_in_i32s = N * word_size_in // 4 - k_x_N_in_i32s = k * N * word_size_in // 4 + k_x_N = k * N # Output Matrix C: MxN - n_in_i32s_out = n * word_size_out // 4 - N_in_i32s_out = N * word_size_out // 4 - m_x_N_in_i32s_out = m * N * word_size_out // 4 + m_x_N = m * N with mlir_mod_ctx() as ctx: @@ -169,9 +159,9 @@ def core_body(): # To/from AIE-array data movement @FuncOp.from_py_func( - T.memref(A_sz_in_i32s, T.i32()), - T.memref(B_sz_in_i32s, T.i32()), - T.memref(C_sz_in_i32s, T.i32()), + T.memref(A_sz, T.bf16()), + T.memref(B_sz, T.bf16()), + T.memref(C_sz, T.bf16()), ) def sequence(A, B, C): @@ -189,9 +179,7 @@ def sequence(A, B, C): for tile_row_block in range( (M_div_m + rows_per_block - 1) // rows_per_block ): - C_row_offset_in_i32s = ( - tile_row_block * rows_per_block * m * N * word_size_out // 4 - ) + C_row_offset = tile_row_block * rows_per_block * m * N num_tile_rows = min( [rows_per_block, M_div_m - tile_row_block * rows_per_block] ) @@ -199,32 +187,28 @@ def sequence(A, B, C): metadata="outC", bd_id=0, mem=C, - offsets=[0, 0, 0, C_row_offset_in_i32s], - sizes=[num_tile_rows, N_div_n, m, n_in_i32s_out], - strides=[m_x_N_in_i32s_out, n_in_i32s_out, N_in_i32s_out], + offsets=[0, 0, 0, C_row_offset], + sizes=[num_tile_rows, N_div_n, m, n], + strides=[m_x_N, n, N], ) for tile_row in range(num_tile_rows): - A_row_offset_in_i32s = ( - ((tile_row_block * rows_per_block) + tile_row) - * m - * K - * word_size_in - // 4 + A_row_offset = ( + ((tile_row_block * rows_per_block) + tile_row) * m * K ) npu_dma_memcpy_nd( metadata="inA", bd_id=2 * tile_row + 1, mem=A, - offsets=[0, 0, 0, A_row_offset_in_i32s], 
- sizes=[N_div_n, K_div_k, m, k_in_i32s], - strides=[0, k_in_i32s, K_in_i32s], + offsets=[0, 0, 0, A_row_offset], + sizes=[N_div_n, K_div_k, m, k], + strides=[0, k, K], ) npu_dma_memcpy_nd( metadata="inB", bd_id=2 * tile_row + 2, mem=B, - sizes=[N_div_n, K_div_k, k, n_in_i32s], - strides=[n_in_i32s, k_x_N_in_i32s, N_in_i32s], + sizes=[N_div_n, K_div_k, k, n], + strides=[n, k_x_N, N], ) npu_sync(column=0, row=0, direction=0, channel=0) diff --git a/programming_examples/basic/passthrough_kernel/aie2.py b/programming_examples/basic/passthrough_kernel/aie2.py index 4fe9a7ed9b..fcd6c84632 100644 --- a/programming_examples/basic/passthrough_kernel/aie2.py +++ b/programming_examples/basic/passthrough_kernel/aie2.py @@ -19,7 +19,6 @@ def passthroughKernel(vector_size, trace_size): N = vector_size lineWidthInBytes = N // 4 # chop input in 4 sub-tensors - lineWidthInInt32s = lineWidthInBytes // 4 @device(AIEDevice.npu1_1col) def device_body(): @@ -58,9 +57,7 @@ def core_body(): # print(ctx.module.operation.verify()) - tensorSize = N - tensorSizeInInt32s = tensorSize // 4 - tensor_ty = T.memref(tensorSizeInInt32s, T.i32()) + tensor_ty = T.memref(N, T.ui8()) @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty) def sequence(inTensor, outTensor, notUsed): @@ -70,20 +67,20 @@ def sequence(inTensor, outTensor, notUsed): ShimTile, ddr_id=1, size=trace_size, - offset=tensorSize, + offset=N, ) npu_dma_memcpy_nd( metadata="in", bd_id=0, mem=inTensor, - sizes=[1, 1, 1, tensorSizeInInt32s], + sizes=[1, 1, 1, N], ) npu_dma_memcpy_nd( metadata="out", bd_id=1, mem=outTensor, - sizes=[1, 1, 1, tensorSizeInInt32s], + sizes=[1, 1, 1, N], ) npu_sync(column=0, row=0, direction=0, channel=0) diff --git a/programming_examples/basic/vector_exp/aie2.py b/programming_examples/basic/vector_exp/aie2.py index af58a6392b..87c8f33c31 100644 --- a/programming_examples/basic/vector_exp/aie2.py +++ b/programming_examples/basic/vector_exp/aie2.py @@ -17,12 +17,7 @@ # AI Engine structural design function def 
my_eltwise_exp(): - word_size_in = 2 N = 65536 - N_in_bytes = N * word_size_in - - A_sz_in_i32s = N_in_bytes // 4 - C_sz_in_i32s = N_in_bytes // 4 # Tile sizes n = 1024 @@ -103,16 +98,12 @@ def core_body(): yield_([]) # To/from AIE-array data movement - tensor_ty = T.memref(N, T.i32()) + tensor_ty = T.memref(N, T.bf16()) @FuncOp.from_py_func(tensor_ty, tensor_ty) def sequence(A, C): - npu_dma_memcpy_nd( - metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, C_sz_in_i32s] - ) - npu_dma_memcpy_nd( - metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s] - ) + npu_dma_memcpy_nd(metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, N]) + npu_dma_memcpy_nd(metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, N]) npu_sync(column=0, row=0, direction=0, channel=0) diff --git a/programming_examples/basic/vector_scalar_mul/aie2.py b/programming_examples/basic/vector_scalar_mul/aie2.py index 8d367ced50..b0a957393b 100644 --- a/programming_examples/basic/vector_scalar_mul/aie2.py +++ b/programming_examples/basic/vector_scalar_mul/aie2.py @@ -17,10 +17,8 @@ def my_vector_scalar(vector_size, trace_size): - word_size_in = 2 N = vector_size - N_in_i32s = N * word_size_in // 4 - N_in_bytes = N_in_i32s * 4 + N_in_bytes = N * 2 N_div_n = 4 # chop input vector into 4 sub-vectors n = N // N_div_n @@ -82,7 +80,7 @@ def core_body(): yield_([]) # To/from AIE-array data movement - tensor_ty = T.memref(N_in_i32s, T.i32()) + tensor_ty = T.memref(N, T.i16()) scalar_ty = T.memref(1, T.i32()) @FuncOp.from_py_func(tensor_ty, scalar_ty, tensor_ty) @@ -96,10 +94,8 @@ def sequence(A, F, C): size=trace_size, offset=N_in_bytes, ) - npu_dma_memcpy_nd( - metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N_in_i32s] - ) - npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N_in_i32s]) + npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N]) + npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N]) npu_dma_memcpy_nd(metadata="infactor", bd_id=2, mem=F, sizes=[1, 1, 1, 1]) 
npu_sync(column=0, row=0, direction=0, channel=0) diff --git a/programming_examples/ml/bottleneck/aie2.py b/programming_examples/ml/bottleneck/aie2.py index ebdaf0d1b0..9ee60a3b62 100644 --- a/programming_examples/ml/bottleneck/aie2.py +++ b/programming_examples/ml/bottleneck/aie2.py @@ -501,16 +501,16 @@ def core_body(): yield_([]) # instruction stream generation - activationsInSize32b = (tensorInW * tensorInH * tensorInC) // 4 - acitivationsOutSize32b = activationsInSize32b - totalWeightsSize32b = ( + activationsIn = tensorInW * tensorInH * tensorInC + acitivationsOut = activationsIn + totalWeights = ( tensorL1InC * tensorL1OutC + 3 * 3 * tensorL2InC * tensorL2OutC + tensorL3InC * tensorL3OutC - ) // 4 + ) - activationsInL3_ty = MemRefType.get((activationsInSize32b,), int32_ty) - weightsInL3_ty = MemRefType.get((totalWeightsSize32b,), int32_ty) + activationsInL3_ty = MemRefType.get((activationsIn,), int8_ty) + weightsInL3_ty = MemRefType.get((totalWeights,), uint8_ty) @FuncOp.from_py_func(activationsInL3_ty, weightsInL3_ty, activationsInL3_ty) def sequence(inputFromL3, weightsFromL3, outputToL3): @@ -568,7 +568,7 @@ def sequence(inputFromL3, weightsFromL3, outputToL3): npu_writebd( bd_id=3, buffer_length=trace_sz_in_i32s, - buffer_offset=acitivationsOutSize32b, + buffer_offset=acitivationsOut, enable_packet=0, out_of_order_id=0, packet_id=0, @@ -616,19 +616,19 @@ def sequence(inputFromL3, weightsFromL3, outputToL3): metadata="inOF_act_L3L2", bd_id=0, mem=inputFromL3, - sizes=[1, 1, 1, activationsInSize32b], + sizes=[1, 1, 1, activationsIn], ) npu_dma_memcpy_nd( metadata="outOFL2L3", bd_id=2, mem=outputToL3, - sizes=[1, 1, 1, acitivationsOutSize32b], + sizes=[1, 1, 1, acitivationsOut], ) npu_dma_memcpy_nd( metadata="inOF_wts_0_L3L2", bd_id=1, mem=weightsFromL3, - sizes=[1, 1, 1, totalWeightsSize32b], + sizes=[1, 1, 1, totalWeights], ) npu_sync(column=0, row=0, direction=0, channel=0) diff --git a/programming_examples/ml/conv2d/aie2.py 
b/programming_examples/ml/conv2d/aie2.py index 11e92f55c2..b6fb537a26 100644 --- a/programming_examples/ml/conv2d/aie2.py +++ b/programming_examples/ml/conv2d/aie2.py @@ -25,14 +25,11 @@ actIn = width * in_channels # 32*64 = 2048 bufIn = actIn * 2 # double buffer -actInInt32s = actIn // 4 weights = in_channels * out_channels -weightsInInt32s = weights // 4 actOut = width * out_channels # 32*64 = 2048 bufOut = actOut * 2 # double buffer -actOutInt32s = actOut // 4 def conv2dk1(): @@ -141,9 +138,8 @@ def core_body(): # To/from AIE-array data movement tensorSize = width * height * in_channels - tensorSizeInInt32s = tensorSize // 4 - tensor_ty = T.memref(tensorSizeInInt32s, T.i32()) - memRef_wts_ty = T.memref(weightsInInt32s, T.i32()) + tensor_ty = T.memref(tensorSize, T.i8()) + memRef_wts_ty = T.memref(weights, T.i8()) # memRef_16x16_ty = T.memref(16, 16, T.i32()) @FuncOp.from_py_func(tensor_ty, memRef_wts_ty, tensor_ty) @@ -154,19 +150,19 @@ def sequence(I, W, O): metadata="inOF_act_L3L2", bd_id=0, mem=I, - sizes=[1, 1, 1, tensorSizeInInt32s], + sizes=[1, 1, 1, tensorSize], ) npu_dma_memcpy_nd( metadata="outOFL2L3", bd_id=2, mem=O, - sizes=[1, 1, 1, tensorSizeInInt32s], + sizes=[1, 1, 1, tensorSize], ) npu_dma_memcpy_nd( metadata="inOF_wts_0_L3L2", bd_id=2, mem=W, - sizes=[1, 1, 1, weightsInInt32s], + sizes=[1, 1, 1, weights], ) npu_sync(column=0, row=0, direction=0, channel=0) diff --git a/programming_examples/ml/conv2d_fused_relu/aie2.py b/programming_examples/ml/conv2d_fused_relu/aie2.py index 98ff6fe624..efd1b13555 100644 --- a/programming_examples/ml/conv2d_fused_relu/aie2.py +++ b/programming_examples/ml/conv2d_fused_relu/aie2.py @@ -25,14 +25,11 @@ actIn = width * in_channels # 32*64 = 2048 bufIn = actIn * 2 # double buffer -actInInt32s = actIn // 4 weights = in_channels * out_channels -weightsInInt32s = weights // 4 actOut = width * out_channels # 32*64 = 2048 bufOut = actOut * 2 # double buffer -actOutInt32s = actOut // 4 enableTrace = False trace_size = 
16384 @@ -148,9 +145,8 @@ def core_body(): # To/from AIE-array data movement tensorSize = width * height * in_channels - tensorSizeInInt32s = tensorSize // 4 - tensor_ty = T.memref(tensorSizeInInt32s, T.i32()) - memRef_wts_ty = T.memref(weightsInInt32s, T.i32()) + tensor_ty = T.memref(tensorSize, T.i8()) + memRef_wts_ty = T.memref(weights, T.i8()) # memRef_16x16_ty = T.memref(16, 16, T.i32()) @FuncOp.from_py_func(tensor_ty, memRef_wts_ty, tensor_ty) @@ -240,19 +236,19 @@ def sequence(I, W, O): metadata="inOF_act_L3L2", bd_id=0, mem=I, - sizes=[1, 1, 1, tensorSizeInInt32s], + sizes=[1, 1, 1, tensorSize], ) npu_dma_memcpy_nd( metadata="outOFL2L3", bd_id=2, mem=O, - sizes=[1, 1, 1, tensorSizeInInt32s], + sizes=[1, 1, 1, tensorSize], ) npu_dma_memcpy_nd( metadata="inOF_wts_0_L3L2", bd_id=2, mem=W, - sizes=[1, 1, 1, weightsInInt32s], + sizes=[1, 1, 1, weights], ) npu_sync(column=0, row=0, direction=0, channel=0) diff --git a/programming_examples/ml/eltwise_add/aie2.py b/programming_examples/ml/eltwise_add/aie2.py index 354e9f78d1..4d0716fa1c 100644 --- a/programming_examples/ml/eltwise_add/aie2.py +++ b/programming_examples/ml/eltwise_add/aie2.py @@ -21,10 +21,6 @@ def my_eltwise_add(trace_size): N = 65536 N_in_bytes = N * word_size_in - A_sz_in_i32s = N_in_bytes // 4 - B_sz_in_i32s = N_in_bytes // 4 - C_sz_in_i32s = N_in_bytes // 4 - # Tile sizes n = 1024 N_div_n = N // n @@ -129,7 +125,7 @@ def core_body(): yield_([]) # To/from AIE-array data movement - tensor_ty = T.memref(N, T.i32()) + tensor_ty = T.memref(N, T.bf16()) @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty) def sequence(A, B, C): @@ -143,15 +139,9 @@ def sequence(A, B, C): offset=N_in_bytes, ) - npu_dma_memcpy_nd( - metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, C_sz_in_i32s] - ) - npu_dma_memcpy_nd( - metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s] - ) - npu_dma_memcpy_nd( - metadata="inB", bd_id=2, mem=B, sizes=[1, 1, 1, B_sz_in_i32s] - ) + npu_dma_memcpy_nd(metadata="outC", bd_id=0, 
mem=C, sizes=[1, 1, 1, N]) + npu_dma_memcpy_nd(metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, N]) + npu_dma_memcpy_nd(metadata="inB", bd_id=2, mem=B, sizes=[1, 1, 1, N]) npu_sync(column=0, row=0, direction=0, channel=0) diff --git a/programming_examples/ml/eltwise_mul/aie2.py b/programming_examples/ml/eltwise_mul/aie2.py index 5808d0c998..4966ecd06e 100644 --- a/programming_examples/ml/eltwise_mul/aie2.py +++ b/programming_examples/ml/eltwise_mul/aie2.py @@ -21,10 +21,6 @@ def my_eltwise_mul(trace_size): N = 65536 N_in_bytes = N * word_size_in - A_sz_in_i32s = N_in_bytes // 4 - B_sz_in_i32s = N_in_bytes // 4 - C_sz_in_i32s = N_in_bytes // 4 - # Tile sizes n = 1024 N_div_n = N // n @@ -130,7 +126,7 @@ def core_body(): yield_([]) # To/from AIE-array data movement - tensor_ty = T.memref(N, T.i32()) + tensor_ty = T.memref(N, T.bf16()) @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty) def sequence(A, B, C): @@ -144,15 +140,9 @@ def sequence(A, B, C): offset=N_in_bytes, ) - npu_dma_memcpy_nd( - metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, C_sz_in_i32s] - ) - npu_dma_memcpy_nd( - metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s] - ) - npu_dma_memcpy_nd( - metadata="inB", bd_id=2, mem=B, sizes=[1, 1, 1, B_sz_in_i32s] - ) + npu_dma_memcpy_nd(metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, N]) + npu_dma_memcpy_nd(metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, N]) + npu_dma_memcpy_nd(metadata="inB", bd_id=2, mem=B, sizes=[1, 1, 1, N]) npu_sync(column=0, row=0, direction=0, channel=0) diff --git a/programming_examples/ml/relu/aie2.py b/programming_examples/ml/relu/aie2.py index e4da4eafdf..2d62135f27 100644 --- a/programming_examples/ml/relu/aie2.py +++ b/programming_examples/ml/relu/aie2.py @@ -21,9 +21,6 @@ def my_relu(trace_size): N = 65536 N_in_bytes = N * word_size_in - A_sz_in_i32s = N_in_bytes // 4 - C_sz_in_i32s = N_in_bytes // 4 - # Tile sizes n = 1024 N_div_n = N // n @@ -105,7 +102,7 @@ def core_body(): yield_([]) # To/from AIE-array data 
movement - tensor_ty = T.memref(N, T.i32()) + tensor_ty = T.memref(N, T.bf16()) @FuncOp.from_py_func(tensor_ty, tensor_ty) def sequence(A, C): @@ -118,12 +115,8 @@ def sequence(A, C): size=trace_size, offset=N_in_bytes, ) - npu_dma_memcpy_nd( - metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, C_sz_in_i32s] - ) - npu_dma_memcpy_nd( - metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s] - ) + npu_dma_memcpy_nd(metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, N]) + npu_dma_memcpy_nd(metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, N]) npu_sync(column=0, row=0, direction=0, channel=0) diff --git a/programming_examples/ml/resnet/layers_conv2_x/aie2.py b/programming_examples/ml/resnet/layers_conv2_x/aie2.py index 94f5888512..729ed2b0fb 100755 --- a/programming_examples/ml/resnet/layers_conv2_x/aie2.py +++ b/programming_examples/ml/resnet/layers_conv2_x/aie2.py @@ -894,33 +894,29 @@ def core_body(): yield_([]) # instruction stream generation - activationsInSize32b = (tensorInW * tensorInH * tensorInCInit) // 4 - acitivationsOutSize32b = (tensorInW * tensorInH * tensorInCRest) // 4 + activationsIn = tensorInW * tensorInH * tensorInCInit + acitivationsOut = tensorInW * tensorInH * tensorInCRest - totalWeightsSize32b_init = ( + totalWeights_init = ( tensorInCInit * tensorInCInit + 3 * 3 * tensorInCInit * tensorInCInit + 2 * tensorInCInit * tensorInCRest - ) // 4 + ) - totalWeightsSize32b_rest = ( + totalWeights_rest = ( tensorInCInit * tensorInCRest + 3 * 3 * tensorInCInit * tensorInCInit + tensorInCInit * tensorInCRest - ) // 4 - - totalWeightsSize32b_complete = ( - totalWeightsSize32b_init + repeat * totalWeightsSize32b_rest ) - activationsInL3_ty = MemRefType.get((activationsInSize32b,), int32_ty) - activationsOutL3_ty = MemRefType.get((acitivationsOutSize32b,), int32_ty) - weightsInL3_ty_init = MemRefType.get((totalWeightsSize32b_init,), int32_ty) - weightsInL3_ty_rest = MemRefType.get((totalWeightsSize32b_rest,), int32_ty) + totalWeights_complete = 
totalWeights_init + repeat * totalWeights_rest - weightsInL3_ty_complete = MemRefType.get( - (totalWeightsSize32b_complete,), int32_ty - ) + activationsInL3_ty = MemRefType.get((activationsIn,), int8_ty) + activationsOutL3_ty = MemRefType.get((acitivationsOut,), int8_ty) + weightsInL3_ty_init = MemRefType.get((totalWeights_init,), int8_ty) + weightsInL3_ty_rest = MemRefType.get((totalWeights_rest,), int8_ty) + + weightsInL3_ty_complete = MemRefType.get((totalWeights_complete,), int8_ty) @FuncOp.from_py_func( activationsInL3_ty, weightsInL3_ty_complete, activationsOutL3_ty @@ -950,27 +946,27 @@ def sequence(inputFromL3, weightsFromL3, outputToL3): metadata="act1_00_02_01", bd_id=0, mem=inputFromL3, - sizes=[1, 1, 1, activationsInSize32b], + sizes=[1, 1, 1, activationsIn], ) npu_dma_memcpy_nd( metadata="outOFL2L3", bd_id=2, mem=outputToL3, - sizes=[1, 1, 1, acitivationsOutSize32b], + sizes=[1, 1, 1, acitivationsOut], ) npu_dma_memcpy_nd( metadata="wts_0_L3L2", bd_id=1, mem=weightsFromL3, - sizes=[1, 1, 1, totalWeightsSize32b_init], + sizes=[1, 1, 1, totalWeights_init], ) npu_dma_memcpy_nd( metadata="wts_1_L3L2", bd_id=1, mem=weightsFromL3, - offsets=[0, 0, 0, totalWeightsSize32b_init], - sizes=[1, 1, 1, totalWeightsSize32b_rest], + offsets=[0, 0, 0, totalWeights_init], + sizes=[1, 1, 1, totalWeights_rest], ) npu_dma_memcpy_nd( @@ -981,9 +977,9 @@ def sequence(inputFromL3, weightsFromL3, outputToL3): 0, 0, 0, - totalWeightsSize32b_init + totalWeightsSize32b_rest, + totalWeights_init + totalWeights_rest, ], - sizes=[1, 1, 1, totalWeightsSize32b_rest], + sizes=[1, 1, 1, totalWeights_rest], ) npu_sync(column=1, row=0, direction=0, channel=0) diff --git a/programming_examples/ml/softmax/aie2.py b/programming_examples/ml/softmax/aie2.py index 47d60adf6a..812bd71781 100755 --- a/programming_examples/ml/softmax/aie2.py +++ b/programming_examples/ml/softmax/aie2.py @@ -22,9 +22,6 @@ def vector_softmax(trace_size): N = 262144 # *1024 N_in_bytes = N * word_size_in - 
A_sz_in_i32s = N_in_bytes // 4 - C_sz_in_i32s = N_in_bytes // 4 - # Tile sizes n = 1024 N_div_n = N // n @@ -108,7 +105,7 @@ def core_body(): yield_([]) # To/from AIE-array data movement - tensor_ty = T.memref(N, T.i32()) + tensor_ty = T.memref(N, T.bf16()) @FuncOp.from_py_func(tensor_ty, tensor_ty) def sequence(A, C): @@ -122,12 +119,8 @@ def sequence(A, C): offset=N_in_bytes, ) - npu_dma_memcpy_nd( - metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, C_sz_in_i32s] - ) - npu_dma_memcpy_nd( - metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s] - ) + npu_dma_memcpy_nd(metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, N]) + npu_dma_memcpy_nd(metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, N]) npu_sync(column=0, row=0, direction=0, channel=0) diff --git a/programming_examples/vision/color_detect/aie2_colorDetect.py b/programming_examples/vision/color_detect/aie2_colorDetect.py index 19e4e04ca9..9a66785bbb 100644 --- a/programming_examples/vision/color_detect/aie2_colorDetect.py +++ b/programming_examples/vision/color_detect/aie2_colorDetect.py @@ -22,11 +22,9 @@ lineWidth = width lineWidthInBytes = width * 4 -lineWidthInInt32s = lineWidthInBytes // 4 enableTrace = False -traceSizeInBytes = 8192 -traceSizeInInt32s = traceSizeInBytes // 4 +traceSize = 1024 def color_detect(): @@ -242,8 +240,7 @@ def coreBody(): # To/from AIE-array data movement tensorSize = width * height * 4 # 4 channels - tensorSizeInInt32s = tensorSize // 4 - tensor_ty = MemRefType.get((tensorSizeInInt32s,), T.i32()) + tensor_ty = MemRefType.get((tensorSize,), T.i8()) memRef_16x16_ty = MemRefType.get( ( 16, @@ -258,13 +255,13 @@ def sequence(I, B, O): metadata="inOF_L3L2", bd_id=1, mem=I, - sizes=[1, 1, 1, height * lineWidthInInt32s], + sizes=[1, 1, 1, height * lineWidthInBytes], ) npu_dma_memcpy_nd( metadata="outOF_L2L3", bd_id=0, mem=O, - sizes=[1, 1, 1, height * lineWidthInInt32s], + sizes=[1, 1, 1, height * lineWidthInBytes], ) npu_sync(column=0, row=0, direction=0, channel=0) diff --git 
a/programming_examples/vision/color_threshold/aie2_colorThreshold.py b/programming_examples/vision/color_threshold/aie2_colorThreshold.py index 1215a4ddd0..fa067226dc 100644 --- a/programming_examples/vision/color_threshold/aie2_colorThreshold.py +++ b/programming_examples/vision/color_threshold/aie2_colorThreshold.py @@ -247,12 +247,11 @@ def core_body(): # To/from AIE-array data movement tensorSize = width * height - tensorSizeInInt32s = tensorSize // 4 @FuncOp.from_py_func( - T.memref(tensorSizeInInt32s, T.i32()), + T.memref(tensorSize, T.i8()), T.memref(32, T.i32()), # not used - T.memref(tensorSizeInInt32s, T.i32()), + T.memref(tensorSize, T.i8()), ) def sequence(inTensor, notUsed, outTensor): # thresholdValue, maxValue, thresholdType @@ -276,13 +275,13 @@ def sequence(inTensor, notUsed, outTensor): metadata="inOOB_L3L2", bd_id=1, mem=inTensor, - sizes=[1, 1, 1, tensorSizeInInt32s], + sizes=[1, 1, 1, tensorSize], ) npu_dma_memcpy_nd( metadata="outOOB_L2L3", bd_id=0, mem=outTensor, - sizes=[1, 1, 1, tensorSizeInInt32s], + sizes=[1, 1, 1, tensorSize], ) npu_sync(column=0, row=0, direction=0, channel=0) diff --git a/programming_examples/vision/edge_detect/aie2_edgeDetect.py b/programming_examples/vision/edge_detect/aie2_edgeDetect.py index 1af069d94e..3e095e356d 100644 --- a/programming_examples/vision/edge_detect/aie2_edgeDetect.py +++ b/programming_examples/vision/edge_detect/aie2_edgeDetect.py @@ -22,7 +22,6 @@ heightMinus1 = height - 1 lineWidth = width lineWidthInBytes = width * 4 -lineWidthInInt32s = lineWidthInBytes // 4 enableTrace = False traceSizeInBytes = 8192 @@ -294,8 +293,7 @@ def core_body(): # To/from AIE-array data movement tensorSize = width * height * 4 # 4 channels - tensorSizeInInt32s = tensorSize // 4 - tensor_ty = T.memref(tensorSizeInInt32s, T.i32()) + tensor_ty = T.memref(tensorSize, T.i8()) memRef_16x16_ty = T.memref(16, 16, T.i32()) @FuncOp.from_py_func(tensor_ty, memRef_16x16_ty, tensor_ty) @@ -304,13 +302,13 @@ def sequence(I, B, O): 
metadata="outOF_L2L3", bd_id=0, mem=O, - sizes=[1, 1, 1, tensorSizeInInt32s], + sizes=[1, 1, 1, tensorSize], ) npu_dma_memcpy_nd( metadata="inOF_L3L2", bd_id=1, mem=I, - sizes=[1, 1, 1, tensorSizeInInt32s], + sizes=[1, 1, 1, tensorSize], ) npu_sync(column=0, row=0, direction=0, channel=0) diff --git a/programming_examples/vision/vision_passthrough/aie2.py b/programming_examples/vision/vision_passthrough/aie2.py index 8d568af388..35e1c5f515 100644 --- a/programming_examples/vision/vision_passthrough/aie2.py +++ b/programming_examples/vision/vision_passthrough/aie2.py @@ -19,7 +19,6 @@ height = int(sys.argv[2]) lineWidthInBytes = width -lineWidthInInt32s = lineWidthInBytes // 4 enableTrace = False traceSizeInBytes = 8192 @@ -68,8 +67,7 @@ def core_body(): # print(ctx.module.operation.verify()) tensorSize = width * height - tensorSizeInInt32s = tensorSize // 4 - tensor_ty = T.memref(tensorSizeInInt32s, T.i32()) + tensor_ty = T.memref(tensorSize, T.i8()) @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty) def sequence(inTensor, notUsed, outTensor): @@ -157,13 +155,13 @@ def sequence(inTensor, notUsed, outTensor): metadata="in", bd_id=1, mem=inTensor, - sizes=[1, 1, 1, tensorSizeInInt32s], + sizes=[1, 1, 1, tensorSize], ) npu_dma_memcpy_nd( metadata="out", bd_id=0, mem=outTensor, - sizes=[1, 1, 1, tensorSizeInInt32s], + sizes=[1, 1, 1, tensorSize], ) npu_sync(column=0, row=0, direction=0, channel=0) diff --git a/test/Conversion/DmaToNpu/bad_dma_to_npu.mlir b/test/Conversion/DmaToNpu/bad_dma_to_npu.mlir new file mode 100644 index 0000000000..6f0ed03057 --- /dev/null +++ b/test/Conversion/DmaToNpu/bad_dma_to_npu.mlir @@ -0,0 +1,29 @@ +//===- bad_dma_to_npu.mlir --------------------------*- MLIR -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2023, Advanced Micro Devices, Inc. 
+// +// Date: July 3rd 2023 +// +//===----------------------------------------------------------------------===// + +// RUN: not aie-opt --aie-dma-to-npu %s 2>&1 | FileCheck %s + +// CHECK: error: 'aiex.npu.dma_memcpy_nd' op Minimum data transfer size required is 32bits. + + +module @shimDmaMemcpy{ + aie.device(xcve2302) { + memref.global "public" @toMem : memref<1xbf16> + func.func @sequence(%arg0: memref<1xbf16>, %arg1: memref<1xbf16>, %arg2: memref<1xbf16>) { + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][4, 4, 64, 64][0, 64, 256]) {id = 0 : i64, metadata = @toMem} : memref<1xbf16> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} + return + } + aie.shim_dma_allocation @toMem (S2MM, 0, 0) + } +} + diff --git a/test/Conversion/DmaToNpu/bad_dma_to_npu_datatype.mlir b/test/Conversion/DmaToNpu/bad_dma_to_npu_datatype.mlir new file mode 100644 index 0000000000..bb4af49938 --- /dev/null +++ b/test/Conversion/DmaToNpu/bad_dma_to_npu_datatype.mlir @@ -0,0 +1,29 @@ +//===- bad_dma_to_npu_datatype.mlir --------------------------*- MLIR -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2023, Advanced Micro Devices, Inc. +// +// Date: July 3rd 2023 +// +//===----------------------------------------------------------------------===// + +// RUN: not aie-opt --aie-dma-to-npu %s 2>&1 | FileCheck %s + +// CHECK: error: 'aiex.npu.dma_memcpy_nd' op Maximum element bit width allowed is 32bits. 
+ + +module @shimDmaMemcpy{ + aie.device(xcve2302) { + memref.global "public" @toMem : memref<65536xi64> + func.func @sequence(%arg0: memref<65536xi64>, %arg1: memref<65536xi64>, %arg2: memref<65536xi64>) { + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][4, 4, 64, 64][0, 64, 256]) {id = 0 : i64, metadata = @toMem} : memref<65536xi64> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} + return + } + aie.shim_dma_allocation @toMem (S2MM, 0, 0) + } +} + diff --git a/test/Conversion/DmaToNpu/dma_to_npu_width_conversion.mlir b/test/Conversion/DmaToNpu/dma_to_npu_width_conversion.mlir new file mode 100644 index 0000000000..af75cd8b33 --- /dev/null +++ b/test/Conversion/DmaToNpu/dma_to_npu_width_conversion.mlir @@ -0,0 +1,40 @@ +//===- dma_to_npu_width_conversion.mlir --------------------------*- MLIR -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2023, Advanced Micro Devices, Inc. 
+// +// Date: July 3rd 2023 +// +//===----------------------------------------------------------------------===// + +// RUN: aie-opt --aie-dma-to-npu %s 2>&1 | FileCheck %s + +//CHECK-LABEL: aie.device(xcve2302) { +//CHECK: memref.global "public" @toMem : memref<65536xbf16> +//CHECK: func.func @sequence(%arg0: memref<65536xbf16>, %arg1: memref<65536xbf16>, %arg2: memref<65536xbf16>) { +//CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 8192 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 32 : i32, d0_stride = 0 : i32, d1_size = 64 : i32, d1_stride = 127 : i32, d2_stride = 31 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} +//CHECK: aiex.npu.address_patch {addr = 118788 : ui32, arg_idx = 0 : i32, arg_plus = 0 : i32} +//CHECK: aiex.npu.write32 {address = 119300 : ui32, column = 0 : i32, row = 0 : i32, value = 2147680256 : ui32} +//CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} +//CHECK: return +//CHECK: } +//CHECK: aie.shim_dma_allocation @toMem(S2MM, 0, 0) +//CHECK: } + + + +module @shimDmaMemcpy{ + aie.device(xcve2302) { + memref.global "public" @toMem : memref<65536xbf16> + func.func @sequence(%arg0: memref<65536xbf16>, %arg1: memref<65536xbf16>, %arg2: memref<65536xbf16>) { + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][4, 4, 64, 64][0, 64, 256]) {id = 0 : i64, metadata = @toMem} : memref<65536xbf16> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} + return + } + aie.shim_dma_allocation @toMem (S2MM, 0, 0) + } +} + diff --git 
a/test/dialect/AIEX/bad_npu_nd.mlir b/test/dialect/AIEX/bad_npu_nd.mlir index 7881d80cfe..c89587b3f1 100644 --- a/test/dialect/AIEX/bad_npu_nd.mlir +++ b/test/dialect/AIEX/bad_npu_nd.mlir @@ -66,17 +66,3 @@ module { // ----- -module { - aie.device(npu1_4col) { - func.func @bad_npu_nd_type(%in : memref<1920x1080xi8>, %buf : memref<32xi32>, %out : memref<1920x1080xi8>) { - %c0 = arith.constant 0 : i64 - %c1 = arith.constant 1 : i64 - %c1920 = arith.constant 1920 : i64 - %c1080 = arith.constant 1080 : i64 - // expected-error@+1 {{must be used with memref type with element width 32.}} - aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1080,%c1920][%c0,%c0,%c1920]) { metadata = @of_fromMem, id = 0 : i64 } : memref<1920x1080xi8> - return - } - aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0) - } -}