Enabling linking in the ROCM/CUDA compiler targets. (#18936)

This does exactly what the LLVMCPU side does, which is bad for compile time (it serializes LLVM codegen) but much better for runtime. Future improvements should move LLVM codegen to the linking phase so that it can happen in parallel, and then perform the linking using LLVM's linker (each executable turned into a .o and then combined into a .so, or into last-level bitcode if we then just want serialization to go from bitcode to machine code). This is definitely a compile-time regression, but we can't keep pessimizing runtime.
iree-org · Oct 29, 2024 · 49ffdac · 49ffdac
1 parent a321be2
commit 49ffdac
Show file tree

Hide file tree

Showing 26 changed files with 669 additions and 167 deletions.
diff --git a/compiler/plugins/target/CUDA/CUDATarget.cpp b/compiler/plugins/target/CUDA/CUDATarget.cpp
@@ -461,6 +461,10 @@ class CUDATargetBackend final : public TargetBackend {
     buildLLVMGPUCodegenPassPipeline(passManager, false);
   }
 
+  void buildLinkingPassPipeline(OpPassManager &passManager) override {
+    buildLLVMGPULinkingPassPipeline(passManager, "cuda");
+  }
+
   LogicalResult serializeExecutable(const SerializationOptions &serOptions,
                                     IREE::HAL::ExecutableVariantOp variantOp,
                                     OpBuilder &executableBuilder) override {

diff --git a/compiler/plugins/target/CUDA/test/smoketest.mlir b/compiler/plugins/target/CUDA/test/smoketest.mlir
@@ -1,8 +1,6 @@
 // RUN: iree-opt --split-input-file --iree-hal-transformation-pipeline --iree-gpu-test-target=sm_60 %s | FileCheck %s
 // RUN: iree-opt --split-input-file --iree-hal-transformation-pipeline --iree-gpu-test-target=sm_60 --iree-hal-dump-executable-binaries-to=- %s 2>&1 | FileCheck %s --check-prefix=PTX
 
-#map = affine_map<(d0) -> (d0)>
-
 module attributes {
   hal.device.targets = [
     #hal.device.target<"cuda", [
@@ -11,13 +9,13 @@ module attributes {
   ]
 } {
 
-stream.executable public @add_dispatch_0 {
-  stream.executable.export @add_dispatch_0 workgroups(%arg0 : index) -> (index, index, index) {
+stream.executable public @add_dispatch_executable {
+  stream.executable.export @add_dispatch workgroups(%arg0 : index) -> (index, index, index) {
     %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0
     stream.return %x, %y, %z : index, index, index
   }
   builtin.module  {
-    func.func @add_dispatch_0(%arg0_binding: !stream.binding, %arg1_binding: !stream.binding, %arg2_binding: !stream.binding) {
+    func.func @add_dispatch(%arg0_binding: !stream.binding, %arg1_binding: !stream.binding, %arg2_binding: !stream.binding) {
       %c0 = arith.constant 0 : index
       %arg0 = stream.binding.subspan %arg0_binding[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<16xf32>>
       %arg1 = stream.binding.subspan %arg1_binding[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<16xf32>>
@@ -26,7 +24,7 @@ stream.executable public @add_dispatch_0 {
       %1 = flow.dispatch.tensor.load %arg0, offsets=[0], sizes=[16], strides=[1] : !flow.dispatch.tensor<readonly:tensor<16xf32>> -> tensor<16xf32>
       %2 = flow.dispatch.tensor.load %arg1, offsets=[0], sizes=[16], strides=[1] : !flow.dispatch.tensor<readonly:tensor<16xf32>> -> tensor<16xf32>
       %3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1, %2 : tensor<16xf32>, tensor<16xf32>) outs(%0 : tensor<16xf32>) {
-      ^bb0(%arg3: f32, %arg4: f32, %arg5: f32):  // no predecessors
+      ^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
         %4 = arith.addf %arg3, %arg4 : f32
         linalg.yield %4 : f32
       } -> tensor<16xf32>
@@ -36,12 +34,42 @@ stream.executable public @add_dispatch_0 {
   }
 }
 
+stream.executable public @mul_dispatch_executable {
+  stream.executable.export @mul_dispatch workgroups(%arg0 : index) -> (index, index, index) {
+    %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0
+    stream.return %x, %y, %z : index, index, index
+  }
+  builtin.module  {
+    func.func @mul_dispatch(%arg0_binding: !stream.binding, %arg1_binding: !stream.binding, %arg2_binding: !stream.binding) {
+      %c0 = arith.constant 0 : index
+      %arg0 = stream.binding.subspan %arg0_binding[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<16xf32>>
+      %arg1 = stream.binding.subspan %arg1_binding[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<16xf32>>
+      %arg2 = stream.binding.subspan %arg2_binding[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<16xf32>>
+      %0 = tensor.empty() : tensor<16xf32>
+      %1 = flow.dispatch.tensor.load %arg0, offsets=[0], sizes=[16], strides=[1] : !flow.dispatch.tensor<readonly:tensor<16xf32>> -> tensor<16xf32>
+      %2 = flow.dispatch.tensor.load %arg1, offsets=[0], sizes=[16], strides=[1] : !flow.dispatch.tensor<readonly:tensor<16xf32>> -> tensor<16xf32>
+      %3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1, %2 : tensor<16xf32>, tensor<16xf32>) outs(%0 : tensor<16xf32>) {
+      ^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
+        %4 = arith.mulf %arg3, %arg4 : f32
+        linalg.yield %4 : f32
+      } -> tensor<16xf32>
+      flow.dispatch.tensor.store %3, %arg2, offsets=[0], sizes=[16], strides=[1] : tensor<16xf32> -> !flow.dispatch.tensor<writeonly:tensor<16xf32>>
+      return
+    }
+  }
+}
+
 }
 
-// PTX: .entry add_dispatch_0
+// PTX: .entry add_dispatch
 // PTX: .maxntid 64, 1, 1
 // PTX:   add.rn.f32
 
-//      CHECK:   hal.executable.binary public @cuda_nvptx_fb attributes {
+// PTX: .entry mul_dispatch
+// PTX: .maxntid 64, 1, 1
+// PTX:   mul.rn.f32
+
+//      CHECK: hal.executable public @smoketest_linked
+// CHECK-NEXT:   hal.executable.binary public @cuda_nvptx_fb attributes {
 // CHECK-SAME:     data = dense
 // CHECK-SAME:     format = "cuda-nvptx-fb"
diff --git a/compiler/plugins/target/LLVMCPU/LLVMCPUTarget.cpp b/compiler/plugins/target/LLVMCPU/LLVMCPUTarget.cpp
@@ -241,7 +241,7 @@ class LLVMCPUTargetBackend final : public TargetBackend {
   }
 
   void buildLinkingPassPipeline(OpPassManager &passManager) override {
-    buildLLVMCPULinkingPassPipeline(passManager);
+    buildLLVMCPULinkingPassPipeline(passManager, "llvm-cpu");
   }
 
   // Gets the LLVM target from |variantOp|.

diff --git a/compiler/plugins/target/ROCM/ROCMTarget.cpp b/compiler/plugins/target/ROCM/ROCMTarget.cpp
@@ -269,6 +269,10 @@ class ROCMTargetBackend final : public TargetBackend {
     buildLLVMGPUCodegenPassPipeline(passManager, true);
   }
 
+  void buildLinkingPassPipeline(OpPassManager &passManager) override {
+    buildLLVMGPULinkingPassPipeline(passManager, "rocm");
+  }
+
   // Performs optimizations on |module| (including LTO-style whole-program
   // ones). Inspired by code section in
   // https://github.com/iree-org/iree/blob/main/compiler/plugins/target/CUDA/CUDATarget.cpp

diff --git a/compiler/plugins/target/ROCM/test/smoketest.mlir b/compiler/plugins/target/ROCM/test/smoketest.mlir
@@ -2,19 +2,19 @@
 
 module attributes {
   hal.device.targets = [
-    #hal.device.target<"hip", [
-      #hal.executable.target<"rocm", "rocm-hsaco-fb">
+    #hal.device.target<"amdgpu", [
+      #hal.executable.target<"rocm", "amdgcn-amd-amdhsa">
     ]> : !hal.device
   ]
 } {
 
-stream.executable public @add_dispatch_0 {
-  stream.executable.export @add_dispatch_0 workgroups(%arg0 : index) -> (index, index, index) {
+stream.executable public @add_dispatch_executable {
+  stream.executable.export @add_dispatch workgroups(%arg0 : index) -> (index, index, index) {
     %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0
     stream.return %x, %y, %z : index, index, index
   }
   builtin.module  {
-    func.func @add_dispatch_0(%arg0_binding: !stream.binding, %arg1_binding: !stream.binding, %arg2_binding: !stream.binding) {
+    func.func @add_dispatch(%arg0_binding: !stream.binding, %arg1_binding: !stream.binding, %arg2_binding: !stream.binding) {
       %c0 = arith.constant 0 : index
       %arg0 = stream.binding.subspan %arg0_binding[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<16xf32>>
       %arg1 = stream.binding.subspan %arg1_binding[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<16xf32>>
@@ -23,7 +23,7 @@ stream.executable public @add_dispatch_0 {
       %1 = flow.dispatch.tensor.load %arg0, offsets=[0], sizes=[16], strides=[1] : !flow.dispatch.tensor<readonly:tensor<16xf32>> -> tensor<16xf32>
       %2 = flow.dispatch.tensor.load %arg1, offsets=[0], sizes=[16], strides=[1] : !flow.dispatch.tensor<readonly:tensor<16xf32>> -> tensor<16xf32>
       %3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1, %2 : tensor<16xf32>, tensor<16xf32>) outs(%0 : tensor<16xf32>) {
-      ^bb0(%arg3: f32, %arg4: f32, %arg5: f32):  // no predecessors
+      ^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
         %4 = arith.addf %arg3, %arg4 : f32
         linalg.yield %4 : f32
       } -> tensor<16xf32>
@@ -33,11 +33,37 @@ stream.executable public @add_dispatch_0 {
   }
 }
 
+stream.executable public @mul_dispatch_executable {
+  stream.executable.export @mul_dispatch workgroups(%arg0 : index) -> (index, index, index) {
+    %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0
+    stream.return %x, %y, %z : index, index, index
+  }
+  builtin.module  {
+    func.func @mul_dispatch(%arg0_binding: !stream.binding, %arg1_binding: !stream.binding, %arg2_binding: !stream.binding) {
+      %c0 = arith.constant 0 : index
+      %arg0 = stream.binding.subspan %arg0_binding[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<16xf32>>
+      %arg1 = stream.binding.subspan %arg1_binding[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<16xf32>>
+      %arg2 = stream.binding.subspan %arg2_binding[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<16xf32>>
+      %0 = tensor.empty() : tensor<16xf32>
+      %1 = flow.dispatch.tensor.load %arg0, offsets=[0], sizes=[16], strides=[1] : !flow.dispatch.tensor<readonly:tensor<16xf32>> -> tensor<16xf32>
+      %2 = flow.dispatch.tensor.load %arg1, offsets=[0], sizes=[16], strides=[1] : !flow.dispatch.tensor<readonly:tensor<16xf32>> -> tensor<16xf32>
+      %3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1, %2 : tensor<16xf32>, tensor<16xf32>) outs(%0 : tensor<16xf32>) {
+      ^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
+        %4 = arith.mulf %arg3, %arg4 : f32
+        linalg.yield %4 : f32
+      } -> tensor<16xf32>
+      flow.dispatch.tensor.store %3, %arg2, offsets=[0], sizes=[16], strides=[1] : tensor<16xf32> -> !flow.dispatch.tensor<writeonly:tensor<16xf32>>
+      return
+    }
+  }
+}
+
 }
 
-//      CHECK:   hal.executable.binary public @rocm_hsaco_fb attributes {
+//      CHECK:   hal.executable public @smoketest_linked
+//      CHECK:   hal.executable.binary public @amdgcn_amd_amdhsa attributes {
 // CHECK-SAME:     data = dense
-// CHECK-SAME:     format = "rocm-hsaco-fb"
+// CHECK-SAME:     format = "amdgcn-amd-amdhsa"
 
 // -----
 
@@ -52,13 +78,13 @@ module attributes {
   ]
 } {
 
-stream.executable public @add_dispatch_0 {
-  stream.executable.export @add_dispatch_0 workgroups(%arg0 : index) -> (index, index, index) {
+stream.executable public @executable {
+  stream.executable.export @export workgroups(%arg0 : index) -> (index, index, index) {
     %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0
     stream.return %x, %y, %z : index, index, index
   } loc(#loc)
   builtin.module  {
-    func.func @add_dispatch_0() {
+    func.func @export() {
       return
     } loc(#loc)
   } loc(#loc)