Skip to content

Commit

Permalink
ROCm FileCheck pattern fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
ekuznetsov139 authored and mmakevic-amd committed Jun 20, 2024
1 parent 32598f2 commit c09263e
Show file tree
Hide file tree
Showing 5 changed files with 79 additions and 66 deletions.
2 changes: 1 addition & 1 deletion xla/service/gpu/tests/add_preds.hlo
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck %s

-// CHECK: define void @fusion({{.*}}%[[ARG0:.*]], {{.*}}%[[ARG1:.*]],
+// CHECK: define{{( amdgpu_kernel)?}} void @fusion({{.*}}%[[ARG0:.*]], {{.*}}%[[ARG1:.*]],
// CHECK: %[[A:.*]] = load {{.*}} ptr %[[ARG0]]
// CHECK: %[[B:.*]] = load {{.*}} ptr %[[ARG1]]
// CHECK: or {{.*}} %[[A]], %[[B]]
Expand Down
2 changes: 1 addition & 1 deletion xla/service/gpu/tests/fused_scatter.hlo
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

// NOTE: Assertions have been autogenerated by utils/generate-test-checks.py

-// CHECK: define void @wrapped_scatter
+// CHECK: define{{( amdgpu_kernel)?}} void @wrapped_scatter
// CHECK: %[[VAL_70:.*]] = alloca i32, align 4
// CHECK-PTX: %[[VAL_71:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x
// CHECK-GCN: %[[VAL_71:.*]] = call i32 @llvm.amdgcn.workgroup.id.x
Expand Down
20 changes: 10 additions & 10 deletions xla/service/gpu/tests/launch_dimensions.hlo
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// This tests that we do not increase the grid launch size when
// few_waves is enabled.

-// CHECK-LABEL: define void @wrapped_b
+// CHECK-LABEL: define{{( amdgpu_kernel)?}} void @wrapped_b
// CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[ctaid_range:[0-9]+]]
// CHECK-GCN-DAG: call i32 @llvm.amdgcn.workgroup.id.x(), !range ![[ctaid_range:[0-9]+]]
// CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[tid_range:[0-9]+]]
Expand All @@ -27,7 +27,7 @@ ENTRY main {

// This tests that we cap grid launch code when few_waves is enabled.

-// CHECK-LABEL: define void @wrapped_b
+// CHECK-LABEL: define{{( amdgpu_kernel)?}} void @wrapped_b
// CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[ctaid_range:[0-9]+]]
// CHECK-GCN-DAG: call i32 @llvm.amdgcn.workgroup.id.x(), !range ![[ctaid_range:[0-9]+]]
// CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[tid_range:[0-9]+]]
Expand All @@ -53,7 +53,7 @@ ENTRY main {
// This tests that we cap grid launch code when few_waves is enabled
// and scalar broadcast are present.

-// CHECK-LABEL: define void @fusion_3
+// CHECK-LABEL: define{{( amdgpu_kernel)?}} void @fusion_3
// CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[ctaid_range:[0-9]+]]
// CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[tid_range:[0-9]+]]
// CHECK-PTX-DAG: ![[ctaid_range]] = !{i32 0, i32 1008}
Expand Down Expand Up @@ -84,7 +84,7 @@ ENTRY main {
// This tests that we enable few_waves in a simple fusion. It is the baseline
// for the tests below.

-// CHECK-LABEL: define void @fusion
+// CHECK-LABEL: define{{( amdgpu_kernel)?}} void @fusion
// CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[ctaid_range:[0-9]+]]
// CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[tid_range:[0-9]+]]
// CHECK-PTX-DAG: ![[ctaid_range]] = !{i32 0, i32 1008}
Expand Down Expand Up @@ -113,7 +113,7 @@ ENTRY main {

// This tests that we keep few_waves enabled for large constants.

-// CHECK-LABEL: define void @fusion
+// CHECK-LABEL: define{{( amdgpu_kernel)?}} void @fusion
// CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[ctaid_range:[0-9]+]]
// CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[tid_range:[0-9]+]]
// CHECK-PTX-DAG: ![[ctaid_range]] = !{i32 0, i32 1008}
Expand Down Expand Up @@ -141,7 +141,7 @@ ENTRY main {

// This tests that we disable few_waves if a non-elementwise op is present.

-// CHECK-LABEL: define void @fusion
+// CHECK-LABEL: define{{( amdgpu_kernel)?}} void @fusion
// CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[ctaid_range:[0-9]+]]
// CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[tid_range:[0-9]+]]
// CHECK-PTX-DAG: ![[ctaid_range]] = !{i32 0, i32 195313}
Expand Down Expand Up @@ -175,7 +175,7 @@ ENTRY main {
// - the fusion is not row-vectorizable
// It serves as a baseline for the tests below.

-// CHECK-LABEL: define void @fusion
+// CHECK-LABEL: define{{( amdgpu_kernel)?}} void @fusion
// CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[ctaid_range:[0-9]+]]
// CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[tid_range:[0-9]+]]
// CHECK-PTX-DAG: ![[ctaid_range]] = !{i32 0, i32 7813}
Expand Down Expand Up @@ -219,7 +219,7 @@ ENTRY main {
// - the fusion IS row-vectorizable
// In this case, the block count is changed from 7813 to 2000.

-// CHECK-LABEL: define void @fusion
+// CHECK-LABEL: define{{( amdgpu_kernel)?}} void @fusion
// CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[ctaid_range:[0-9]+]]
// CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[tid_range:[0-9]+]]
// CHECK-PTX-DAG: ![[ctaid_range]] = !{i32 0, i32 2000}
Expand Down Expand Up @@ -260,7 +260,7 @@ ENTRY main {
// - the fusion is not row-vectorizable
// In this case, the block count is changed from 7813 to 1008.

-// CHECK-LABEL: define void @fusion
+// CHECK-LABEL: define{{( amdgpu_kernel)?}} void @fusion
// CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[ctaid_range:[0-9]+]]
// CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[tid_range:[0-9]+]]
// CHECK-PTX-DAG: ![[ctaid_range]] = !{i32 0, i32 1008}
Expand Down Expand Up @@ -300,7 +300,7 @@ ENTRY main {
// This tests the GELU kernel. The original kernel that
// motivated few_waves implementation.

-// CHECK-LABEL: define void @fusion
+// CHECK-LABEL: define{{( amdgpu_kernel)?}} void @fusion
// CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[ctaid_range:[0-9]+]]
// CHECK-GCN-DAG: call i32 @llvm.amdgcn.workgroup.id.x(), !range ![[ctaid_range:[0-9]+]]
// CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[tid_range:[0-9]+]]
Expand Down
2 changes: 1 addition & 1 deletion xla/service/gpu/tests/reduce_atomic_min.hlo
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ ENTRY reduce.1 {
// CHECK: wrapped_reduce.in_bounds-after: ; preds = %[[VAL_9]], %[[VAL_11:.*]]
// CHECK: ret void
// CHECK: wrapped_reduce.in_bounds-true: ; preds = %[[VAL_11]]
-// CHECK: %[[VAL_12:.*]] = load float, ptr %[[VAL_13:.*]], align 4, !invariant.load !5
+// CHECK: %[[VAL_12:.*]] = load float, ptr %[[VAL_13:.*]], align 4, !invariant.load
// CHECK: store float %[[VAL_12]], ptr %[[VAL_14:.*]], align 4
// CHECK: br label %[[VAL_10]]
// CHECK: entry:
Expand Down
119 changes: 66 additions & 53 deletions xla/service/gpu/tests/reduce_column_layout_change.hlo
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ ENTRY kernel_entry {
// CHECK: ret void
// CHECK: reduce-group-0-true: ; preds = %[[VAL_20]]
// CHECK: %[[VAL_21:.*]] = load float, ptr @0, align 4
-// CHECK: store float %[[VAL_21]], ptr %[[VAL_13]], align 4
+// CHECK: store float %[[VAL_21]], ptr{{( addrspace\(5\))?}} %[[VAL_13]], align 4
// CHECK-PTX: %thread.id.x = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !3
// CHECK-GCN: %thread.id.x = call i32 @llvm.amdgcn.workitem.id.x
// CHECK-PTX: %block.id.x = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !4
Expand All @@ -62,71 +62,84 @@ ENTRY kernel_entry {
// CHECK: %tile_origin.0 = mul i32 %[[VAL_27]], 1
// CHECK: %tile_origin.1 = mul i32 %[[VAL_26]], 4096
// CHECK: %tile_origin.2 = mul i32 %[[VAL_24]], 32
-// CHECK: store i32 %thread.id.1, ptr %[[VAL_12]], align 4
+// CHECK: store i32 %thread.id.1, ptr{{( addrspace\(5\))?}} %[[VAL_12]], align 4
// CHECK: br label %[[VAL_29:.*]]
// CHECK: loop1.loop_header: ; preds = %[[VAL_30:.*]], %[[VAL_17]]
-// CHECK: %[[VAL_31:.*]] = load i32, ptr %[[VAL_12]], align 4
+// CHECK: %[[VAL_31:.*]] = load i32, ptr{{( addrspace\(5\))?}} %[[VAL_12]], align 4
// CHECK: %[[VAL_32:.*]] = icmp uge i32 %[[VAL_31]], %tile_bound.1
// CHECK: br i1 %[[VAL_32]], label %[[VAL_33:.*]], label %[[VAL_34:.*]]
// CHECK: loop1.loop_body: ; preds = %[[VAL_29]]
// CHECK: %[[VAL_35:.*]] = add nuw nsw i32 %[[VAL_31]], 32
-// CHECK: store i32 %[[VAL_35]], ptr %[[VAL_12]], align 4
-// CHECK: store i32 0, ptr %[[VAL_11]], align 4
+// CHECK: store i32 %[[VAL_35]], ptr{{( addrspace\(5\))?}} %[[VAL_12]], align 4
+// CHECK: store i32 0, ptr{{( addrspace\(5\))?}} %[[VAL_11]], align 4
// CHECK: br label %[[VAL_37:.*]]
// CHECK: loop2.loop_header: ; preds = %[[VAL_38:.*]], %[[VAL_34]]
-// CHECK: %[[VAL_39:.*]] = load i32, ptr %[[VAL_11]], align 4
+// CHECK: %[[VAL_39:.*]] = load i32, ptr{{( addrspace\(5\))?}} %[[VAL_11]], align 4
// CHECK: %[[VAL_40:.*]] = icmp uge i32 %[[VAL_39]], 32
// CHECK: br i1 %[[VAL_40]], label %[[VAL_30]], label %[[VAL_41:.*]]
// CHECK: loop2.loop_body: ; preds = %[[VAL_37]]
// CHECK: %[[VAL_42:.*]] = add nuw nsw i32 %[[VAL_39]], 32
-// CHECK: store i32 %[[VAL_42]], ptr %[[VAL_11]], align 4
+// CHECK: store i32 %[[VAL_42]], ptr{{( addrspace\(5\))?}} %[[VAL_11]], align 4
// CHECK: %[[VAL_44:.*]] = add i32 %[[VAL_39]], %thread.id.2
// CHECK: %[[VAL_45:.*]] = icmp ult i32 %[[VAL_44]], 32
// CHECK: br i1 %[[VAL_45]], label %[[VAL_46:.*]], label %[[VAL_38]]
// CHECK: x_in_tile-after: ; preds = %[[VAL_46]], %[[VAL_41]]
-// CHECK: br label %[[VAL_37]], !llvm.loop !5
+// CHECK: br label %[[VAL_37]], !llvm.loop
// CHECK: loop2.loop_exit: ; preds = %[[VAL_37]]
-// CHECK: br label %[[VAL_29]], !llvm.loop !8
+// CHECK: br label %[[VAL_29]], !llvm.loop
// CHECK: loop1.loop_exit: ; preds = %[[VAL_29]]
-// CHECK: %[[VAL_47:.*]] = load float, ptr %[[VAL_13]], align 4
+// CHECK: %[[VAL_47:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_13]], align 4
// CHECK: %[[VAL_48:.*]] = getelementptr inbounds [32 x [33 x float]], ptr addrspace(3) @shared_cache, i32 0, i32 %thread.id.2, i32 %thread.id.1
// CHECK: %[[VAL_49:.*]] = addrspacecast ptr addrspace(3) %[[VAL_48]] to ptr
// CHECK: store float %[[VAL_47]], ptr %[[VAL_49]], align 4
-// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK-PTX: call void @llvm.nvvm.barrier0()
+// CHECK-GCN: call void @llvm.amdgcn.s.barrier()
// CHECK: %[[VAL_50:.*]] = getelementptr inbounds [32 x [33 x float]], ptr addrspace(3) @shared_cache, i32 0, i32 %thread.id.1, i32 %thread.id.2
// CHECK: %[[VAL_51:.*]] = addrspacecast ptr addrspace(3) %[[VAL_50]] to ptr
// CHECK: %[[VAL_52:.*]] = load float, ptr %[[VAL_51]], align 4
-// CHECK: %[[VAL_53:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_52]], i32 16, i32 31)
-// CHECK: store float %[[VAL_53]], ptr %[[VAL_9]], align 4
-// CHECK: call void @[[REDUCTION0:reduction0.*]](ptr %[[VAL_51]], ptr %[[VAL_9]], ptr %[[VAL_8]])
-// CHECK: %[[VAL_54:.*]] = load float, ptr %[[VAL_8]], align 4
+// CHECK-PTX: %[[VAL_53:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_52]], i32 16, i32 31)
+// CHECK-GCN: %[[VAL_53_:.*]] = call i32 @__ockl_readuplane_i32
+// CHECK-GCN: %[[VAL_53:.*]] = bitcast i32
+// CHECK: store float %[[VAL_53]], ptr{{( addrspace\(5\))?}} %[[VAL_9]], align 4
+// CHECK-PTX: call void @[[REDUCTION0:reduction0.*]](ptr %[[VAL_51]], ptr %[[VAL_9]], ptr %[[VAL_8]])
+// CHECK: %[[VAL_54:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_8]], align 4
// CHECK: store float %[[VAL_54]], ptr %[[VAL_51]], align 4
-// CHECK: %[[VAL_55:.*]] = load float, ptr %[[VAL_51]], align 4
-// CHECK: %[[VAL_56:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_55]], i32 8, i32 31)
-// CHECK: store float %[[VAL_56]], ptr %[[VAL_7]], align 4
-// CHECK: call void @[[REDUCTION0]](ptr %[[VAL_51]], ptr %[[VAL_7]], ptr %[[VAL_6]])
-// CHECK: %[[VAL_57:.*]] = load float, ptr %[[VAL_6]], align 4
-// CHECK: store float %[[VAL_57]], ptr %[[VAL_51]], align 4
-// CHECK: %[[VAL_58:.*]] = load float, ptr %[[VAL_51]], align 4
-// CHECK: %[[VAL_59:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_58]], i32 4, i32 31)
-// CHECK: store float %[[VAL_59]], ptr %[[VAL_5]], align 4
-// CHECK: call void @[[REDUCTION0]](ptr %[[VAL_51]], ptr %[[VAL_5]], ptr %[[VAL_4]])
-// CHECK: %[[VAL_60:.*]] = load float, ptr %[[VAL_4]], align 4
-// CHECK: store float %[[VAL_60]], ptr %[[VAL_51]], align 4
-// CHECK: %[[VAL_61:.*]] = load float, ptr %[[VAL_51]], align 4
-// CHECK: %[[VAL_62:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_61]], i32 2, i32 31)
-// CHECK: store float %[[VAL_62]], ptr %[[VAL_3]], align 4
-// CHECK: call void @[[REDUCTION0]](ptr %[[VAL_51]], ptr %[[VAL_3]], ptr %[[VAL_2]])
-// CHECK: %[[VAL_63:.*]] = load float, ptr %[[VAL_2]], align 4
-// CHECK: store float %[[VAL_63]], ptr %[[VAL_51]], align 4
-// CHECK: %[[VAL_64:.*]] = load float, ptr %[[VAL_51]], align 4
-// CHECK: %[[VAL_65:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_64]], i32 1, i32 31)
-// CHECK: store float %[[VAL_65]], ptr %[[VAL_1]], align 4
-// CHECK: call void @[[REDUCTION0]](ptr %[[VAL_51]], ptr %[[VAL_1]], ptr %[[VAL_0]])
-// CHECK: %[[VAL_66:.*]] = load float, ptr %[[VAL_0]], align 4
-// CHECK: store float %[[VAL_66]], ptr %[[VAL_51]], align 4
-// CHECK: %[[VAL_67:.*]] = icmp ult i32 %thread.id.1, 32
-// CHECK: %[[VAL_68:.*]] = icmp ult i32 %thread.id.2, %tile_bound.1
+// CHECK: %[[VAL_55:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_51]], align 4
+// CHECK-PTX: %[[VAL_56:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_55]], i32 8, i32 31)
+// CHECK-GCN: %[[VAL_56_1_:.*]] = call i32 @__ockl_readuplane_i32
+// CHECK-GCN: %[[VAL_56:.*]] = bitcast i32
+// CHECK: store float %[[VAL_56]], ptr{{( addrspace\(5\))?}} %[[VAL_7]], align 4
+// CHECK-PTX: call void @[[REDUCTION0]](ptr %[[VAL_51]], ptr %[[VAL_7]], ptr %[[VAL_6]])
+// CHECK: %[[VAL_57:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_6]], align 4
+// CHECK: store float %[[VAL_57]], ptr{{( addrspace\(5\))?}} %[[VAL_51]], align 4
+// CHECK: %[[VAL_58:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_51]], align 4
+// CHECK-PTX: %[[VAL_59:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_58]], i32 4, i32 31)
+// CHECK-GCN: %[[VAL_59_:.*]] = call i32 @__ockl_readuplane_i32
+// CHECK-GCN: %[[VAL_59:.*]] = bitcast i32
+// CHECK: store float %[[VAL_59]], ptr{{( addrspace\(5\))?}} %[[VAL_5]], align 4
+// CHECK-PTX: call void @[[REDUCTION0]](ptr %[[VAL_51]], ptr %[[VAL_5]], ptr %[[VAL_4]])
+// CHECK: %[[VAL_60:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_4]], align 4
+// CHECK: store float %[[VAL_60]], ptr{{( addrspace\(5\))?}} %[[VAL_51]], align 4
+// CHECK: %[[VAL_61:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_51]], align 4
+// CHECK-PTX: %[[VAL_62:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_61]], i32 2, i32 31)
+// CHECK-GCN: %[[VAL_62_:.*]] = call i32 @__ockl_readuplane_i32
+// CHECK-GCN: %[[VAL_62:.*]] = bitcast i32
+// CHECK: store float %[[VAL_62]], ptr{{( addrspace\(5\))?}} %[[VAL_3]], align 4
+// CHECK-PTX: call void @[[REDUCTION0]](ptr %[[VAL_51]], ptr %[[VAL_3]], ptr %[[VAL_2]])
+// CHECK: %[[VAL_63:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_2]], align 4
+// CHECK: store float %[[VAL_63]], ptr{{( addrspace\(5\))?}} %[[VAL_51]], align 4
+// CHECK: %[[VAL_64:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_51]], align 4
+// CHECK-PTX: %[[VAL_65:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_64]], i32 1, i32 31)
+// CHECK-GCN: %[[VAL_65_:.*]] = call i32 @__ockl_readuplane_i32
+// CHECK-GCN: %[[VAL_65:.*]] = bitcast i32
+// CHECK: store float %[[VAL_65]], ptr{{( addrspace\(5\))?}} %[[VAL_1]], align 4
+// CHECK-PTX: call void @[[REDUCTION0]](ptr %[[VAL_51]], ptr %[[VAL_1]], ptr %[[VAL_0]])
+// CHECK: %[[VAL_66:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_0]], align 4
+// CHECK: store float %[[VAL_66]], ptr{{( addrspace\(5\))?}} %[[VAL_51]], align 4
+// CHECK-PTX: %[[VAL_67:.*]] = icmp ult i32 %thread.id.1, 32
+// CHECK-PTX: %[[VAL_68:.*]] = icmp ult i32 %thread.id.2, %tile_bound.1
+// CHECK-GCN: %[[VAL_68:.*]] = icmp ult i32 %thread.id.2, %tile_bound.1
+// CHECK-GCN: %[[VAL_67:.*]] = icmp ult i32 %thread.id.1, 32
// CHECK: %[[VAL_69:.*]] = and i1 %[[VAL_67]], %[[VAL_68]]
// CHECK: %[[VAL_70:.*]] = icmp eq i32 %lane_id, 0
// CHECK: %[[VAL_71:.*]] = and i1 %[[VAL_69]], %[[VAL_70]]
Expand Down Expand Up @@ -158,11 +171,11 @@ ENTRY kernel_entry {
// CHECK: %[[VAL_94:.*]] = mul nuw nsw i32 %[[VAL_73]], 1
// CHECK: %[[VAL_95:.*]] = add nuw nsw i32 0, %[[VAL_94]]
// CHECK: %[[VAL_96:.*]] = getelementptr inbounds [12 x [3 x [32 x [16 x [32 x [4 x [3 x [12 x float]]]]]]]], ptr %[[VAL_97:.*]], i32 0, i32 %[[VAL_92]], i32 %[[VAL_91]], i32 %[[VAL_89]], i32 %[[VAL_85]], i32 %[[VAL_84]], i32 %[[VAL_82]], i32 %[[VAL_80]], i32 %[[VAL_78]]
-// CHECK: %[[VAL_98:.*]] = load float, ptr %[[VAL_96]], align 4, !invariant.load !9
-// CHECK: store float %[[VAL_98]], ptr %[[VAL_14]], align 4
-// CHECK: call void @[[REDUCTION0]](ptr %[[VAL_13]], ptr %[[VAL_14]], ptr %[[VAL_10]])
-// CHECK: %[[VAL_99:.*]] = load float, ptr %[[VAL_10]], align 4
-// CHECK: store float %[[VAL_99]], ptr %[[VAL_13]], align 4
+// CHECK: %[[VAL_98:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_96]], align 4, !invariant.load
+// CHECK: store float %[[VAL_98]], ptr{{( addrspace\(5\))?}} %[[VAL_14]], align 4
+// CHECK-PTX: call void @[[REDUCTION0]](ptr %[[VAL_13]], ptr %[[VAL_14]], ptr %[[VAL_10]])
+// CHECK: %[[VAL_99:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_10]], align 4
+// CHECK: store float %[[VAL_99]], ptr{{( addrspace\(5\))?}} %[[VAL_13]], align 4
// CHECK: br label %[[VAL_38]]
// CHECK: reduction_write_output-true: ; preds = %[[VAL_33]]
// CHECK: %[[VAL_100:.*]] = add i32 %tile_origin.2, %thread.id.1
Expand All @@ -180,15 +193,15 @@ ENTRY kernel_entry {
// CHECK: %[[VAL_112:.*]] = mul nuw nsw i32 %tile_origin.0, 1
// CHECK: %[[VAL_113:.*]] = add nuw nsw i32 0, %[[VAL_112]]
// CHECK: %[[VAL_114:.*]] = getelementptr inbounds [12 x [16 x [4 x [3 x [32 x float]]]]], ptr %[[VAL_115:.*]], i32 0, i32 %[[VAL_103]], i32 %[[VAL_110]], i32 %[[VAL_107]], i32 %[[VAL_105]], i32 %[[VAL_109]]
-// CHECK: %[[VAL_116:.*]] = load float, ptr %[[VAL_51]], align 4
-// CHECK: store float %[[VAL_116]], ptr %[[VAL_114]], align 4
+// CHECK: %[[VAL_116:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_51]], align 4
+// CHECK: store float %[[VAL_116]], ptr{{( addrspace\(5\))?}} %[[VAL_114]], align 4
// CHECK: br label %[[VAL_19]]
// CHECK: entry:
// CHECK: %[[VAL_117:.*]] = alloca float, align 4
-// CHECK: %[[VAL_118:.*]] = load float, ptr %[[VAL_119:.*]], align 4
-// CHECK: %[[VAL_120:.*]] = load float, ptr %[[VAL_121:.*]], align 4
+// CHECK: %[[VAL_118:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_119:.*]], align 4
+// CHECK: %[[VAL_120:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_121:.*]], align 4
// CHECK: %[[VAL_122:.*]] = fadd float %[[VAL_118]], %[[VAL_120]]
-// CHECK: store float %[[VAL_122]], ptr %[[VAL_117]], align 4
-// CHECK: %[[VAL_123:.*]] = load float, ptr %[[VAL_117]], align 4
-// CHECK: store float %[[VAL_123]], ptr %[[VAL_124:.*]], align 4
+// CHECK: store float %[[VAL_122]], ptr{{( addrspace\(5\))?}} %[[VAL_117]], align 4
+// CHECK: %[[VAL_123:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_117]], align 4
+// CHECK: store float %[[VAL_123]], ptr{{( addrspace\(5\))?}} %[[VAL_124:.*]], align 4
// CHECK: ret void

0 comments on commit c09263e

Please sign in to comment.