From 5f49f3543f5769fdd39fa6f3b8960fda75fcb095 Mon Sep 17 00:00:00 2001 From: Nirvedh Meshram Date: Thu, 13 Jun 2024 09:40:24 -0600 Subject: [PATCH 1/4] merge kernels in existing XCLBIN --- .../AMD-AIE/iree-amd-aie/Target/AIETarget.cpp | 75 +++++++++++++++---- .../driver/xrt/native_executable.cc | 26 +++++-- 2 files changed, 78 insertions(+), 23 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp index a8d2d2060..e02cd0e2f 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp @@ -268,6 +268,8 @@ LogicalResult AIETargetBackend::serializeExecutable( SmallVector xclbinIndices(ordinalCount); SmallVector asmInstrIndices(ordinalCount); + SmallVector> xclbinPaths; + for (size_t i = 0; i < entryPointNames.size(); i++) { uint64_t ordinal = entryPointOrdinals.at(entryPointNames[i]); @@ -300,18 +302,34 @@ LogicalResult AIETargetBackend::serializeExecutable( llvm::sys::path::append(npuInstPath, entryPointNamesFb[ordinal] + ".npu.txt"); - SmallVector cmdArgs{aie2xclbin, - inputMlirPath, - "--peano", - options.peanoInstallDir, - "--xclbin-name", - xclbinPath, - "--npu-insts-name", - npuInstPath, - "--xclbin-kernel-name", - entryPointNamesFb[ordinal], - "--tmpdir", - entryPointWorkDir}; + // Convert ordinal to hexadecimal string for xclbin kern id + std::stringstream ss; + ss << "0x" << std::hex << ordinal + 10; + std::string ordinalHex = ss.str(); + + SmallVector cmdArgs; + SmallVector cmdArgsBase{aie2xclbin, + inputMlirPath, + "--peano", + options.peanoInstallDir, + "--xclbin-name", + xclbinPath, + "--npu-insts-name", + npuInstPath, + "--xclbin-kernel-name", + entryPointNamesFb[ordinal], + "--tmpdir", + entryPointWorkDir, + "--xclbin-kernel-id", + ordinalHex}; + cmdArgs = cmdArgsBase; + bool AttemptingMerge = false; + if (i > 0) { + cmdArgs.push_back("--input-xclbin-name"); + 
cmdArgs.push_back(xclbinPaths.back()); + AttemptingMerge = true; + } + xclbinPaths.push_back(xclbinPath); auto addOpt = [&](StringRef arg, bool value) { if (value) cmdArgs.push_back(arg); @@ -350,11 +368,24 @@ LogicalResult AIETargetBackend::serializeExecutable( { SmallVector cmdEnvRefs{cmdEnv.begin(), cmdEnv.end()}; int result = llvm::sys::ExecuteAndWait(cmdArgs[0], cmdArgs, cmdEnvRefs); - if (result != 0) + if (result != 0 && AttemptingMerge) { + // we failed to create xclbin but maybe we failed becuase we were trying + // to merge the kerenel in exisiting kernel, try again to see if perhaps + // we have success if we dont try to merge. + AttemptingMerge = false; + result = + llvm::sys::ExecuteAndWait(cmdArgsBase[0], cmdArgsBase, cmdEnvRefs); + xclbinPaths.push_back(xclbinPath); + } + if (result != 0) { return moduleOp.emitOpError( "Failed to produce an XCLBin with external tool."); + } + // delete the previous xclbin if we were able to merge as the new one now + // will have all the kernels from the previous one. 
+ if (AttemptingMerge) xclbinPaths.erase(xclbinPaths.end() - 2); + xclbinIndices[ordinal] = xclbinPaths.size() - 1; } - std::ifstream instrFile(static_cast(npuInstPath)); std::string line; while (std::getline(instrFile, line)) { @@ -369,7 +400,7 @@ LogicalResult AIETargetBackend::serializeExecutable( asmInstrIndices[ordinal] = asmInstrRefs.size(); asmInstrRefs.push_back( iree_amd_aie_hal_xrt_AsmInstDef_create(builder, npuInstrsVec)); - + /* xclbinIn = openInputFile(xclbinPath, &errorMessage); if (!xclbinIn) { moduleOp.emitOpError() << "Failed to open xclbin file: " << errorMessage; @@ -378,7 +409,21 @@ LogicalResult AIETargetBackend::serializeExecutable( xclbinIndices[ordinal] = xclbinRefs.size(); xclbinRefs.push_back( iree_amd_aie_hal_xrt_XclbinDef_create(builder, xclbinStringRef)); + */ + } + // write out the final xclbins to flatbuffer + for (auto xclbinPath : xclbinPaths) { + llvm::outs() << "writing xclbin from path: " << xclbinPath << "\n"; + std::string errorMessage; + xclbinIn = openInputFile(xclbinPath, &errorMessage); + if (!xclbinIn) { + moduleOp.emitOpError() << "Failed to open xclbin file: " << errorMessage; + } + auto xclbinStringRef = builder.createString(xclbinIn->getBuffer()); + xclbinRefs.push_back( + iree_amd_aie_hal_xrt_XclbinDef_create(builder, xclbinStringRef)); } + // Serialize the executable to flatbuffer format auto entryPointsRef = builder.createStringVec(entryPointNamesFb); diff --git a/runtime/src/iree-amd-aie/driver/xrt/native_executable.cc b/runtime/src/iree-amd-aie/driver/xrt/native_executable.cc index 7481c836c..d572dbe4c 100644 --- a/runtime/src/iree-amd-aie/driver/xrt/native_executable.cc +++ b/runtime/src/iree-amd-aie/driver/xrt/native_executable.cc @@ -128,6 +128,9 @@ iree_status_t iree_hal_xrt_native_executable_create( iree_amd_aie_hal_xrt_XclbinDef_vec_t xclbins_vec = iree_amd_aie_hal_xrt_ExecutableDef_xclbins_get(executable_def); + iree_host_size_t number_xclbin = + iree_amd_aie_hal_xrt_XclbinDef_vec_len(xclbins_vec); + 
iree_amd_aie_hal_xrt_AsmInstDef_vec_t asm_instrs_vec = iree_amd_aie_hal_xrt_ExecutableDef_asm_instrs_get(executable_def); @@ -163,17 +166,15 @@ iree_status_t iree_hal_xrt_native_executable_create( &executable->resource); executable->host_allocator = host_allocator; executable->entry_point_count = entry_point_count; - for (iree_host_size_t entry_ordinal = 0; entry_ordinal < entry_point_count; - entry_ordinal++) { - const char* entry_name = - flatbuffers_string_vec_at(entry_points_vec, entry_ordinal); - uint32_t xclbin_index = - flatbuffers_uint32_vec_at(xclbin_indices_vec, entry_ordinal); + // collect all the hardware contexts first as multiple entry points can map to + // the same context and this way we don't need to keep reloading them. + std::vector contexts; + for (iree_host_size_t xclbin_index = 0; xclbin_index < number_xclbin; + xclbin_index++) { iree_amd_aie_hal_xrt_XclbinDef_table_t xclbin_def = iree_amd_aie_hal_xrt_XclbinDef_vec_at(xclbins_vec, xclbin_index); flatbuffers_string_t xclbin_fb = iree_amd_aie_hal_xrt_XclbinDef_xclbin_get(xclbin_def); - // XRT API needs this vector and cant actually read a void*. 
std::vector xclbinVector( xclbin_fb, xclbin_fb + flatbuffers_string_len(xclbin_fb)); @@ -186,6 +187,14 @@ iree_status_t iree_hal_xrt_native_executable_create( } device.register_xclbin(xclbin); xrt::hw_context context(device, xclbin.get_uuid()); + contexts.push_back(context); + } + for (iree_host_size_t entry_ordinal = 0; entry_ordinal < entry_point_count; + entry_ordinal++) { + const char* entry_name = + flatbuffers_string_vec_at(entry_points_vec, entry_ordinal); + uint32_t xclbin_index = + flatbuffers_uint32_vec_at(xclbin_indices_vec, entry_ordinal); uint32_t asm_instr_index = flatbuffers_uint32_vec_at(asm_instr_indices_vec, entry_ordinal); iree_amd_aie_hal_xrt_AsmInstDef_table_t asminst_def = @@ -196,7 +205,8 @@ iree_status_t iree_hal_xrt_native_executable_create( std::unique_ptr kernel; std::unique_ptr instr; try { - kernel = std::make_unique(context, entry_name); + kernel = + std::make_unique(contexts[xclbin_index], entry_name); // XCL_BO_FLAGS_CACHEABLE is used to indicate that this is an instruction // buffer that resides in instr_memory. 
This buffer is always passed as // the second argument to the kernel and we can use the From 3d864d926efc84356fa8fa39173343254c1189a7 Mon Sep 17 00:00:00 2001 From: Nirvedh Meshram Date: Mon, 17 Jun 2024 10:18:19 -0600 Subject: [PATCH 2/4] bump mlir aie wheel --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4e9bbe923..8523252af 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -103,7 +103,7 @@ jobs: run: | python3 -m venv .venv source .venv/bin/activate - pip install https://github.com/Xilinx/mlir-aie/releases/download/latest-wheels/mlir_aie-0.0.1.2024061222+3ac9566-py3-none-manylinux_2_35_x86_64.whl + pip install https://github.com/Xilinx/mlir-aie/releases/download/latest-wheels/mlir_aie-0.0.1.2024061622+18c8815-py3-none-manylinux_2_35_x86_64.whl pip install -r tests/matmul/requirements.txt From a11e4198f31fe65303d3fe50fa8e30d8d5f8d45b Mon Sep 17 00:00:00 2001 From: Nirvedh Meshram Date: Mon, 17 Jun 2024 13:15:21 -0600 Subject: [PATCH 3/4] add test and minor fixes --- build_tools/ci/cpu_comparison/run_test.sh | 3 ++ .../test_files/three_matmuls.mlir | 31 +++++++++++++++++++ .../AMD-AIE/iree-amd-aie/Target/AIETarget.cpp | 18 +++-------- 3 files changed, 38 insertions(+), 14 deletions(-) create mode 100644 build_tools/ci/cpu_comparison/test_files/three_matmuls.mlir diff --git a/build_tools/ci/cpu_comparison/run_test.sh b/build_tools/ci/cpu_comparison/run_test.sh index 8e3941138..2bf318043 100755 --- a/build_tools/ci/cpu_comparison/run_test.sh +++ b/build_tools/ci/cpu_comparison/run_test.sh @@ -303,6 +303,9 @@ function run_test() { run_test \ --test_file ${THIS_DIR}/test_files/matmul_int32.mlir +run_test \ + --test_file ${THIS_DIR}/test_files/three_matmuls.mlir + run_test \ --name_prefix "matmul" \ --lhs_rhs_type "bf16" \ diff --git a/build_tools/ci/cpu_comparison/test_files/three_matmuls.mlir 
b/build_tools/ci/cpu_comparison/test_files/three_matmuls.mlir new file mode 100644 index 000000000..efa30612f --- /dev/null +++ b/build_tools/ci/cpu_comparison/test_files/three_matmuls.mlir @@ -0,0 +1,31 @@ +// This test shows arbitory matmuls that would have producer consumer relationships +// across different dispatches running on CI. + +// These lines are strictly required by the script which generates input data: +// +// input 32x32xf32 +// input 32x32xf32 +// input 32x4xf32 +// input 4x32xf32 + +!A_TYPE = tensor<32x32xf32> +!B_TYPE = tensor<32x4xf32> +!C_TYPE = tensor <4x32xf32> +!D_TYPE = tensor <4x4xf32> +func.func @two_mm(%lhs : !A_TYPE, + %rhs : !A_TYPE, %rhs_2 : !B_TYPE, %lhs_2 : !C_TYPE) -> !D_TYPE { + %empty = tensor.empty() : !A_TYPE + %empty_2 = tensor.empty() : !B_TYPE + %empty_3 = tensor.empty() : !D_TYPE + %cst = arith.constant 0.0 : f32 + %fill = linalg.fill ins(%cst : f32) outs(%empty : !A_TYPE) -> !A_TYPE + %fill_2 = linalg.fill ins(%cst : f32) outs(%empty_2 : !B_TYPE) -> !B_TYPE + %fill_3 = linalg.fill ins(%cst : f32) outs(%empty_3 : !D_TYPE) -> !D_TYPE + %2 = linalg.matmul ins(%lhs, %rhs : !A_TYPE, !A_TYPE) + outs(%fill : !A_TYPE) -> !A_TYPE + %3 = linalg.matmul ins(%2, %rhs_2 : !A_TYPE, !B_TYPE) + outs(%fill_2 : !B_TYPE) -> !B_TYPE + %4 = linalg.matmul ins(%lhs_2, %3 : !C_TYPE, !B_TYPE) + outs(%fill_3 : !D_TYPE) -> !D_TYPE + return %4 : !D_TYPE +} diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp index e02cd0e2f..c441f1240 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp @@ -302,7 +302,7 @@ LogicalResult AIETargetBackend::serializeExecutable( llvm::sys::path::append(npuInstPath, entryPointNamesFb[ordinal] + ".npu.txt"); - // Convert ordinal to hexadecimal string for xclbin kern id + // Convert ordinal to hexadecimal string for xclbin kernel id. 
std::stringstream ss; ss << "0x" << std::hex << ordinal + 10; std::string ordinalHex = ss.str(); @@ -370,7 +370,7 @@ LogicalResult AIETargetBackend::serializeExecutable( int result = llvm::sys::ExecuteAndWait(cmdArgs[0], cmdArgs, cmdEnvRefs); if (result != 0 && AttemptingMerge) { // we failed to create xclbin but maybe we failed becuase we were trying - // to merge the kerenel in exisiting kernel, try again to see if perhaps + // to merge the kerenel in exisiting xclbin, try again to see if perhaps // we have success if we dont try to merge. AttemptingMerge = false; result = @@ -400,18 +400,8 @@ LogicalResult AIETargetBackend::serializeExecutable( asmInstrIndices[ordinal] = asmInstrRefs.size(); asmInstrRefs.push_back( iree_amd_aie_hal_xrt_AsmInstDef_create(builder, npuInstrsVec)); - /* - xclbinIn = openInputFile(xclbinPath, &errorMessage); - if (!xclbinIn) { - moduleOp.emitOpError() << "Failed to open xclbin file: " << errorMessage; - } - auto xclbinStringRef = builder.createString(xclbinIn->getBuffer()); - xclbinIndices[ordinal] = xclbinRefs.size(); - xclbinRefs.push_back( - iree_amd_aie_hal_xrt_XclbinDef_create(builder, xclbinStringRef)); - */ } - // write out the final xclbins to flatbuffer + // Write out the final xclbins to flatbuffer. for (auto xclbinPath : xclbinPaths) { llvm::outs() << "writing xclbin from path: " << xclbinPath << "\n"; std::string errorMessage; @@ -424,7 +414,7 @@ LogicalResult AIETargetBackend::serializeExecutable( iree_amd_aie_hal_xrt_XclbinDef_create(builder, xclbinStringRef)); } - // Serialize the executable to flatbuffer format + // Serialize the executable to flatbuffer format. 
auto entryPointsRef = builder.createStringVec(entryPointNamesFb); iree_amd_aie_hal_xrt_ExecutableDef_entry_points_add(builder, entryPointsRef); From f5a63aca58ac4b1e2dd65ac807d419b46e8194be Mon Sep 17 00:00:00 2001 From: Nirvedh Meshram Date: Tue, 18 Jun 2024 09:47:56 -0600 Subject: [PATCH 4/4] address reviewer comments --- .../ci/cpu_comparison/test_files/three_matmuls.mlir | 2 +- .../target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/build_tools/ci/cpu_comparison/test_files/three_matmuls.mlir b/build_tools/ci/cpu_comparison/test_files/three_matmuls.mlir index efa30612f..fb222380c 100644 --- a/build_tools/ci/cpu_comparison/test_files/three_matmuls.mlir +++ b/build_tools/ci/cpu_comparison/test_files/three_matmuls.mlir @@ -1,4 +1,4 @@ -// This test shows arbitory matmuls that would have producer consumer relationships +// This test shows arbitrary matmuls that would have producer consumer relationships // across different dispatches running on CI. 
// These lines are strictly required by the script which generates input data: diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp index c441f1240..1a26c4f14 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp @@ -323,11 +323,11 @@ LogicalResult AIETargetBackend::serializeExecutable( "--xclbin-kernel-id", ordinalHex}; cmdArgs = cmdArgsBase; - bool AttemptingMerge = false; + bool attemptingMerge = false; if (i > 0) { cmdArgs.push_back("--input-xclbin-name"); cmdArgs.push_back(xclbinPaths.back()); - AttemptingMerge = true; + attemptingMerge = true; } xclbinPaths.push_back(xclbinPath); @@ -368,11 +368,11 @@ LogicalResult AIETargetBackend::serializeExecutable( { SmallVector cmdEnvRefs{cmdEnv.begin(), cmdEnv.end()}; int result = llvm::sys::ExecuteAndWait(cmdArgs[0], cmdArgs, cmdEnvRefs); - if (result != 0 && AttemptingMerge) { + if (result != 0 && attemptingMerge) { // we failed to create xclbin but maybe we failed becuase we were trying // to merge the kerenel in exisiting xclbin, try again to see if perhaps // we have success if we dont try to merge. - AttemptingMerge = false; + attemptingMerge = false; result = llvm::sys::ExecuteAndWait(cmdArgsBase[0], cmdArgsBase, cmdEnvRefs); xclbinPaths.push_back(xclbinPath); @@ -383,7 +383,7 @@ LogicalResult AIETargetBackend::serializeExecutable( } // delete the previous xclbin if we were able to merge as the new one now // will have all the kernels from the previous one. - if (AttemptingMerge) xclbinPaths.erase(xclbinPaths.end() - 2); + if (attemptingMerge) xclbinPaths.erase(xclbinPaths.end() - 2); xclbinIndices[ordinal] = xclbinPaths.size() - 1; } std::ifstream instrFile(static_cast(npuInstPath));