From c09263e7e8abc8997be280bbd2c3c282d1f74c33 Mon Sep 17 00:00:00 2001 From: Eugene Kuznetsov Date: Tue, 16 Apr 2024 10:25:50 +0000 Subject: [PATCH] ROCm FileCheck pattern fixes --- xla/service/gpu/tests/add_preds.hlo | 2 +- xla/service/gpu/tests/fused_scatter.hlo | 2 +- xla/service/gpu/tests/launch_dimensions.hlo | 20 +-- xla/service/gpu/tests/reduce_atomic_min.hlo | 2 +- .../gpu/tests/reduce_column_layout_change.hlo | 119 ++++++++++-------- 5 files changed, 79 insertions(+), 66 deletions(-) diff --git a/xla/service/gpu/tests/add_preds.hlo b/xla/service/gpu/tests/add_preds.hlo index 120b6a5ad686b..d86113ae2ad60 100644 --- a/xla/service/gpu/tests/add_preds.hlo +++ b/xla/service/gpu/tests/add_preds.hlo @@ -1,6 +1,6 @@ // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck %s -// CHECK: define void @fusion({{.*}}%[[ARG0:.*]], {{.*}}%[[ARG1:.*]], +// CHECK: define{{( amdgpu_kernel)?}} void @fusion({{.*}}%[[ARG0:.*]], {{.*}}%[[ARG1:.*]], // CHECK: %[[A:.*]] = load {{.*}} ptr %[[ARG0]] // CHECK: %[[B:.*]] = load {{.*}} ptr %[[ARG1]] // CHECK: or {{.*}} %[[A]], %[[B]] diff --git a/xla/service/gpu/tests/fused_scatter.hlo b/xla/service/gpu/tests/fused_scatter.hlo index 9a30436ebfa38..f8cb266bc4c67 100644 --- a/xla/service/gpu/tests/fused_scatter.hlo +++ b/xla/service/gpu/tests/fused_scatter.hlo @@ -2,7 +2,7 @@ // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py -// CHECK: define void @wrapped_scatter +// CHECK: define{{( amdgpu_kernel)?}} void @wrapped_scatter // CHECK: %[[VAL_70:.*]] = alloca i32, align 4 // CHECK-PTX: %[[VAL_71:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x // CHECK-GCN: %[[VAL_71:.*]] = call i32 @llvm.amdgcn.workgroup.id.x diff --git a/xla/service/gpu/tests/launch_dimensions.hlo b/xla/service/gpu/tests/launch_dimensions.hlo index bcfa37733f7e6..3d05dcf9892ad 100644 --- a/xla/service/gpu/tests/launch_dimensions.hlo +++ b/xla/service/gpu/tests/launch_dimensions.hlo @@ -2,7 +2,7 @@ // This tests that we do not increase the grid launch size when // few_waves is enabled. -// CHECK-LABEL: define void @wrapped_b +// CHECK-LABEL: define{{( amdgpu_kernel)?}} void @wrapped_b // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[ctaid_range:[0-9]+]] // CHECK-GCN-DAG: call i32 @llvm.amdgcn.workgroup.id.x(), !range ![[ctaid_range:[0-9]+]] // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[tid_range:[0-9]+]] @@ -27,7 +27,7 @@ ENTRY main { // This tests that we cap grid launch code when few_waves is enabled. -// CHECK-LABEL: define void @wrapped_b +// CHECK-LABEL: define{{( amdgpu_kernel)?}} void @wrapped_b // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[ctaid_range:[0-9]+]] // CHECK-GCN-DAG: call i32 @llvm.amdgcn.workgroup.id.x(), !range ![[ctaid_range:[0-9]+]] // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[tid_range:[0-9]+]] @@ -53,7 +53,7 @@ ENTRY main { // This tests that we cap grid launch code when few_waves is enabled // and scalar broadcast are present. -// CHECK-LABEL: define void @fusion_3 +// CHECK-LABEL: define{{( amdgpu_kernel)?}} void @fusion_3 // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[ctaid_range:[0-9]+]] // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[tid_range:[0-9]+]] // CHECK-PTX-DAG: ![[ctaid_range]] = !{i32 0, i32 1008} @@ -84,7 +84,7 @@ ENTRY main { // This tests that we enable few_waves in a simple fusion. It is the baseline // for the tests below. -// CHECK-LABEL: define void @fusion +// CHECK-LABEL: define{{( amdgpu_kernel)?}} void @fusion // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[ctaid_range:[0-9]+]] // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[tid_range:[0-9]+]] // CHECK-PTX-DAG: ![[ctaid_range]] = !{i32 0, i32 1008} @@ -113,7 +113,7 @@ ENTRY main { // This tests that we keep few_waves enabled for large constants. -// CHECK-LABEL: define void @fusion +// CHECK-LABEL: define{{( amdgpu_kernel)?}} void @fusion // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[ctaid_range:[0-9]+]] // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[tid_range:[0-9]+]] // CHECK-PTX-DAG: ![[ctaid_range]] = !{i32 0, i32 1008} @@ -141,7 +141,7 @@ ENTRY main { // This tests that we disable few_waves if a non-elementwise op is present. -// CHECK-LABEL: define void @fusion +// CHECK-LABEL: define{{( amdgpu_kernel)?}} void @fusion // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[ctaid_range:[0-9]+]] // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[tid_range:[0-9]+]] // CHECK-PTX-DAG: ![[ctaid_range]] = !{i32 0, i32 195313} @@ -175,7 +175,7 @@ ENTRY main { // - the fusion is not row-vectorizable // It serves as a baseline for the tests below. -// CHECK-LABEL: define void @fusion +// CHECK-LABEL: define{{( amdgpu_kernel)?}} void @fusion // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[ctaid_range:[0-9]+]] // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[tid_range:[0-9]+]] // CHECK-PTX-DAG: ![[ctaid_range]] = !{i32 0, i32 7813} @@ -219,7 +219,7 @@ ENTRY main { // - the fusion IS row-vectorizable // In this case, the block count is changed from 7813 to 2000. -// CHECK-LABEL: define void @fusion +// CHECK-LABEL: define{{( amdgpu_kernel)?}} void @fusion // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[ctaid_range:[0-9]+]] // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[tid_range:[0-9]+]] // CHECK-PTX-DAG: ![[ctaid_range]] = !{i32 0, i32 2000} @@ -260,7 +260,7 @@ ENTRY main { // - the fusion is not row-vectorizable // In this case, the block count is changed from 7813 to 1008. -// CHECK-LABEL: define void @fusion +// CHECK-LABEL: define{{( amdgpu_kernel)?}} void @fusion // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[ctaid_range:[0-9]+]] // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[tid_range:[0-9]+]] // CHECK-PTX-DAG: ![[ctaid_range]] = !{i32 0, i32 1008} @@ -300,7 +300,7 @@ ENTRY main { // This tests the GELU kernel. The original kernel that // motivated few_waves implementation. -// CHECK-LABEL: define void @fusion +// CHECK-LABEL: define{{( amdgpu_kernel)?}} void @fusion // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[ctaid_range:[0-9]+]] // CHECK-GCN-DAG: call i32 @llvm.amdgcn.workgroup.id.x(), !range ![[ctaid_range:[0-9]+]] // CHECK-PTX-DAG: call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[tid_range:[0-9]+]] diff --git a/xla/service/gpu/tests/reduce_atomic_min.hlo b/xla/service/gpu/tests/reduce_atomic_min.hlo index 0b429ac134b48..46e1240f947f8 100644 --- a/xla/service/gpu/tests/reduce_atomic_min.hlo +++ b/xla/service/gpu/tests/reduce_atomic_min.hlo @@ -39,7 +39,7 @@ ENTRY reduce.1 { // CHECK: wrapped_reduce.in_bounds-after: ; preds = %[[VAL_9]], %[[VAL_11:.*]] // CHECK: ret void // CHECK: wrapped_reduce.in_bounds-true: ; preds = %[[VAL_11]] -// CHECK: %[[VAL_12:.*]] = load float, ptr %[[VAL_13:.*]], align 4, !invariant.load !5 +// CHECK: %[[VAL_12:.*]] = load float, ptr %[[VAL_13:.*]], align 4, !invariant.load // CHECK: store float %[[VAL_12]], ptr %[[VAL_14:.*]], align 4 // CHECK: br label %[[VAL_10]] // CHECK: entry: diff --git a/xla/service/gpu/tests/reduce_column_layout_change.hlo b/xla/service/gpu/tests/reduce_column_layout_change.hlo index cb30643886de4..122929f3df280 100644 --- a/xla/service/gpu/tests/reduce_column_layout_change.hlo +++ b/xla/service/gpu/tests/reduce_column_layout_change.hlo @@ -43,7 +43,7 @@ ENTRY kernel_entry { // CHECK: ret void // CHECK: reduce-group-0-true: ; preds = %[[VAL_20]] // CHECK: %[[VAL_21:.*]] = load float, ptr @0, align 4 -// CHECK: store float %[[VAL_21]], ptr %[[VAL_13]], align 4 +// CHECK: store float %[[VAL_21]], ptr{{( addrspace\(5\))?}} %[[VAL_13]], align 4 // CHECK-PTX: %thread.id.x = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !3 // CHECK-GCN: %thread.id.x = call i32 @llvm.amdgcn.workitem.id.x // CHECK-PTX: %block.id.x = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !4 @@ -62,71 +62,84 @@ ENTRY kernel_entry { // CHECK: %tile_origin.0 = mul i32 %[[VAL_27]], 1 // CHECK: %tile_origin.1 = mul i32 %[[VAL_26]], 4096 // CHECK: %tile_origin.2 = mul i32 %[[VAL_24]], 32 -// CHECK: store i32 %thread.id.1, ptr %[[VAL_12]], align 4 +// CHECK: store i32 %thread.id.1, ptr{{( addrspace\(5\))?}} %[[VAL_12]], align 4 // CHECK: br label %[[VAL_29:.*]] // CHECK: loop1.loop_header: ; preds = %[[VAL_30:.*]], %[[VAL_17]] -// CHECK: %[[VAL_31:.*]] = load i32, ptr %[[VAL_12]], align 4 +// CHECK: %[[VAL_31:.*]] = load i32, ptr{{( addrspace\(5\))?}} %[[VAL_12]], align 4 // CHECK: %[[VAL_32:.*]] = icmp uge i32 %[[VAL_31]], %tile_bound.1 // CHECK: br i1 %[[VAL_32]], label %[[VAL_33:.*]], label %[[VAL_34:.*]] // CHECK: loop1.loop_body: ; preds = %[[VAL_29]] // CHECK: %[[VAL_35:.*]] = add nuw nsw i32 %[[VAL_31]], 32 -// CHECK: store i32 %[[VAL_35]], ptr %[[VAL_12]], align 4 -// CHECK: store i32 0, ptr %[[VAL_11]], align 4 +// CHECK: store i32 %[[VAL_35]], ptr{{( addrspace\(5\))?}} %[[VAL_12]], align 4 +// CHECK: store i32 0, ptr{{( addrspace\(5\))?}} %[[VAL_11]], align 4 // CHECK: br label %[[VAL_37:.*]] // CHECK: loop2.loop_header: ; preds = %[[VAL_38:.*]], %[[VAL_34]] -// CHECK: %[[VAL_39:.*]] = load i32, ptr %[[VAL_11]], align 4 +// CHECK: %[[VAL_39:.*]] = load i32, ptr{{( addrspace\(5\))?}} %[[VAL_11]], align 4 // CHECK: %[[VAL_40:.*]] = icmp uge i32 %[[VAL_39]], 32 // CHECK: br i1 %[[VAL_40]], label %[[VAL_30]], label %[[VAL_41:.*]] // CHECK: loop2.loop_body: ; preds = %[[VAL_37]] // CHECK: %[[VAL_42:.*]] = add nuw nsw i32 %[[VAL_39]], 32 -// CHECK: store i32 %[[VAL_42]], ptr %[[VAL_11]], align 4 +// CHECK: store i32 %[[VAL_42]], ptr{{( addrspace\(5\))?}} %[[VAL_11]], align 4 // CHECK: %[[VAL_44:.*]] = add i32 %[[VAL_39]], %thread.id.2 // CHECK: %[[VAL_45:.*]] = icmp ult i32 %[[VAL_44]], 32 // CHECK: br i1 %[[VAL_45]], label %[[VAL_46:.*]], label %[[VAL_38]] // CHECK: x_in_tile-after: ; preds = %[[VAL_46]], %[[VAL_41]] -// CHECK: br label %[[VAL_37]], !llvm.loop !5 +// CHECK: br label %[[VAL_37]], !llvm.loop // CHECK: loop2.loop_exit: ; preds = %[[VAL_37]] -// CHECK: br label %[[VAL_29]], !llvm.loop !8 +// CHECK: br label %[[VAL_29]], !llvm.loop // CHECK: loop1.loop_exit: ; preds = %[[VAL_29]] -// CHECK: %[[VAL_47:.*]] = load float, ptr %[[VAL_13]], align 4 +// CHECK: %[[VAL_47:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_13]], align 4 // CHECK: %[[VAL_48:.*]] = getelementptr inbounds [32 x [33 x float]], ptr addrspace(3) @shared_cache, i32 0, i32 %thread.id.2, i32 %thread.id.1 // CHECK: %[[VAL_49:.*]] = addrspacecast ptr addrspace(3) %[[VAL_48]] to ptr // CHECK: store float %[[VAL_47]], ptr %[[VAL_49]], align 4 -// CHECK: call void @llvm.nvvm.barrier0() +// CHECK-PTX: call void @llvm.nvvm.barrier0() +// CHECK-GCN: call void @llvm.amdgcn.s.barrier() // CHECK: %[[VAL_50:.*]] = getelementptr inbounds [32 x [33 x float]], ptr addrspace(3) @shared_cache, i32 0, i32 %thread.id.1, i32 %thread.id.2 // CHECK: %[[VAL_51:.*]] = addrspacecast ptr addrspace(3) %[[VAL_50]] to ptr // CHECK: %[[VAL_52:.*]] = load float, ptr %[[VAL_51]], align 4 -// CHECK: %[[VAL_53:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_52]], i32 16, i32 31) -// CHECK: store float %[[VAL_53]], ptr %[[VAL_9]], align 4 -// CHECK: call void @[[REDUCTION0:reduction0.*]](ptr %[[VAL_51]], ptr %[[VAL_9]], ptr %[[VAL_8]]) -// CHECK: %[[VAL_54:.*]] = load float, ptr %[[VAL_8]], align 4 +// CHECK-PTX: %[[VAL_53:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_52]], i32 16, i32 31) +// CHECK-GCN: %[[VAL_53_:.*]] = call i32 @__ockl_readuplane_i32 +// CHECK-GCN: %[[VAL_53:.*]] = bitcast i32 +// CHECK: store float %[[VAL_53]], ptr{{( addrspace\(5\))?}} %[[VAL_9]], align 4 +// CHECK-PTX: call void @[[REDUCTION0:reduction0.*]](ptr %[[VAL_51]], ptr %[[VAL_9]], ptr %[[VAL_8]]) +// CHECK: %[[VAL_54:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_8]], align 4 // CHECK: store float %[[VAL_54]], ptr %[[VAL_51]], align 4 -// CHECK: %[[VAL_55:.*]] = load float, ptr %[[VAL_51]], align 4 -// CHECK: %[[VAL_56:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_55]], i32 8, i32 31) -// CHECK: store float %[[VAL_56]], ptr %[[VAL_7]], align 4 -// CHECK: call void @[[REDUCTION0]](ptr %[[VAL_51]], ptr %[[VAL_7]], ptr %[[VAL_6]]) -// CHECK: %[[VAL_57:.*]] = load float, ptr %[[VAL_6]], align 4 -// CHECK: store float %[[VAL_57]], ptr %[[VAL_51]], align 4 -// CHECK: %[[VAL_58:.*]] = load float, ptr %[[VAL_51]], align 4 -// CHECK: %[[VAL_59:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_58]], i32 4, i32 31) -// CHECK: store float %[[VAL_59]], ptr %[[VAL_5]], align 4 -// CHECK: call void @[[REDUCTION0]](ptr %[[VAL_51]], ptr %[[VAL_5]], ptr %[[VAL_4]]) -// CHECK: %[[VAL_60:.*]] = load float, ptr %[[VAL_4]], align 4 -// CHECK: store float %[[VAL_60]], ptr %[[VAL_51]], align 4 -// CHECK: %[[VAL_61:.*]] = load float, ptr %[[VAL_51]], align 4 -// CHECK: %[[VAL_62:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_61]], i32 2, i32 31) -// CHECK: store float %[[VAL_62]], ptr %[[VAL_3]], align 4 -// CHECK: call void @[[REDUCTION0]](ptr %[[VAL_51]], ptr %[[VAL_3]], ptr %[[VAL_2]]) -// CHECK: %[[VAL_63:.*]] = load float, ptr %[[VAL_2]], align 4 -// CHECK: store float %[[VAL_63]], ptr %[[VAL_51]], align 4 -// CHECK: %[[VAL_64:.*]] = load float, ptr %[[VAL_51]], align 4 -// CHECK: %[[VAL_65:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_64]], i32 1, i32 31) -// CHECK: store float %[[VAL_65]], ptr %[[VAL_1]], align 4 -// CHECK: call void @[[REDUCTION0]](ptr %[[VAL_51]], ptr %[[VAL_1]], ptr %[[VAL_0]]) -// CHECK: %[[VAL_66:.*]] = load float, ptr %[[VAL_0]], align 4 -// CHECK: store float %[[VAL_66]], ptr %[[VAL_51]], align 4 -// CHECK: %[[VAL_67:.*]] = icmp ult i32 %thread.id.1, 32 -// CHECK: %[[VAL_68:.*]] = icmp ult i32 %thread.id.2, %tile_bound.1 +// CHECK: %[[VAL_55:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_51]], align 4 +// CHECK-PTX: %[[VAL_56:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_55]], i32 8, i32 31) +// CHECK-GCN: %[[VAL_56_1_:.*]] = call i32 @__ockl_readuplane_i32 +// CHECK-GCN: %[[VAL_56:.*]] = bitcast i32 +// CHECK: store float %[[VAL_56]], ptr{{( addrspace\(5\))?}} %[[VAL_7]], align 4 +// CHECK-PTX: call void @[[REDUCTION0]](ptr %[[VAL_51]], ptr %[[VAL_7]], ptr %[[VAL_6]]) +// CHECK: %[[VAL_57:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_6]], align 4 +// CHECK: store float %[[VAL_57]], ptr{{( addrspace\(5\))?}} %[[VAL_51]], align 4 +// CHECK: %[[VAL_58:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_51]], align 4 +// CHECK-PTX: %[[VAL_59:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_58]], i32 4, i32 31) +// CHECK-GCN: %[[VAL_59_:.*]] = call i32 @__ockl_readuplane_i32 +// CHECK-GCN: %[[VAL_59:.*]] = bitcast i32 +// CHECK: store float %[[VAL_59]], ptr{{( addrspace\(5\))?}} %[[VAL_5]], align 4 +// CHECK-PTX: call void @[[REDUCTION0]](ptr %[[VAL_51]], ptr %[[VAL_5]], ptr %[[VAL_4]]) +// CHECK: %[[VAL_60:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_4]], align 4 +// CHECK: store float %[[VAL_60]], ptr{{( addrspace\(5\))?}} %[[VAL_51]], align 4 +// CHECK: %[[VAL_61:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_51]], align 4 +// CHECK-PTX: %[[VAL_62:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_61]], i32 2, i32 31) +// CHECK-GCN: %[[VAL_62_:.*]] = call i32 @__ockl_readuplane_i32 +// CHECK-GCN: %[[VAL_62:.*]] = bitcast i32 +// CHECK: store float %[[VAL_62]], ptr{{( addrspace\(5\))?}} %[[VAL_3]], align 4 +// CHECK-PTX: call void @[[REDUCTION0]](ptr %[[VAL_51]], ptr %[[VAL_3]], ptr %[[VAL_2]]) +// CHECK: %[[VAL_63:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_2]], align 4 +// CHECK: store float %[[VAL_63]], ptr{{( addrspace\(5\))?}} %[[VAL_51]], align 4 +// CHECK: %[[VAL_64:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_51]], align 4 +// CHECK-PTX: %[[VAL_65:.*]] = call float @llvm.nvvm.shfl.sync.down.f32(i32 -1, float %[[VAL_64]], i32 1, i32 31) +// CHECK-GCN: %[[VAL_65_:.*]] = call i32 @__ockl_readuplane_i32 +// CHECK-GCN: %[[VAL_65:.*]] = bitcast i32 +// CHECK: store float %[[VAL_65]], ptr{{( addrspace\(5\))?}} %[[VAL_1]], align 4 +// CHECK-PTX: call void @[[REDUCTION0]](ptr %[[VAL_51]], ptr %[[VAL_1]], ptr %[[VAL_0]]) +// CHECK: %[[VAL_66:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_0]], align 4 +// CHECK: store float %[[VAL_66]], ptr{{( addrspace\(5\))?}} %[[VAL_51]], align 4 +// CHECK-PTX: %[[VAL_67:.*]] = icmp ult i32 %thread.id.1, 32 +// CHECK-PTX: %[[VAL_68:.*]] = icmp ult i32 %thread.id.2, %tile_bound.1 +// CHECK-GCN: %[[VAL_68:.*]] = icmp ult i32 %thread.id.2, %tile_bound.1 +// CHECK-GCN: %[[VAL_67:.*]] = icmp ult i32 %thread.id.1, 32 // CHECK: %[[VAL_69:.*]] = and i1 %[[VAL_67]], %[[VAL_68]] // CHECK: %[[VAL_70:.*]] = icmp eq i32 %lane_id, 0 // CHECK: %[[VAL_71:.*]] = and i1 %[[VAL_69]], %[[VAL_70]] @@ -158,11 +171,11 @@ ENTRY kernel_entry { // CHECK: %[[VAL_94:.*]] = mul nuw nsw i32 %[[VAL_73]], 1 // CHECK: %[[VAL_95:.*]] = add nuw nsw i32 0, %[[VAL_94]] // CHECK: %[[VAL_96:.*]] = getelementptr inbounds [12 x [3 x [32 x [16 x [32 x [4 x [3 x [12 x float]]]]]]]], ptr %[[VAL_97:.*]], i32 0, i32 %[[VAL_92]], i32 %[[VAL_91]], i32 %[[VAL_89]], i32 %[[VAL_85]], i32 %[[VAL_84]], i32 %[[VAL_82]], i32 %[[VAL_80]], i32 %[[VAL_78]] -// CHECK: %[[VAL_98:.*]] = load float, ptr %[[VAL_96]], align 4, !invariant.load !9 -// CHECK: store float %[[VAL_98]], ptr %[[VAL_14]], align 4 -// CHECK: call void @[[REDUCTION0]](ptr %[[VAL_13]], ptr %[[VAL_14]], ptr %[[VAL_10]]) -// CHECK: %[[VAL_99:.*]] = load float, ptr %[[VAL_10]], align 4 -// CHECK: store float %[[VAL_99]], ptr %[[VAL_13]], align 4 +// CHECK: %[[VAL_98:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_96]], align 4, !invariant.load +// CHECK: store float %[[VAL_98]], ptr{{( addrspace\(5\))?}} %[[VAL_14]], align 4 +// CHECK-PTX: call void @[[REDUCTION0]](ptr %[[VAL_13]], ptr %[[VAL_14]], ptr %[[VAL_10]]) +// CHECK: %[[VAL_99:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_10]], align 4 +// CHECK: store float %[[VAL_99]], ptr{{( addrspace\(5\))?}} %[[VAL_13]], align 4 // CHECK: br label %[[VAL_38]] // CHECK: reduction_write_output-true: ; preds = %[[VAL_33]] // CHECK: %[[VAL_100:.*]] = add i32 %tile_origin.2, %thread.id.1 @@ -180,15 +193,15 @@ ENTRY kernel_entry { // CHECK: %[[VAL_112:.*]] = mul nuw nsw i32 %tile_origin.0, 1 // CHECK: %[[VAL_113:.*]] = add nuw nsw i32 0, %[[VAL_112]] // CHECK: %[[VAL_114:.*]] = getelementptr inbounds [12 x [16 x [4 x [3 x [32 x float]]]]], ptr %[[VAL_115:.*]], i32 0, i32 %[[VAL_103]], i32 %[[VAL_110]], i32 %[[VAL_107]], i32 %[[VAL_105]], i32 %[[VAL_109]] -// CHECK: %[[VAL_116:.*]] = load float, ptr %[[VAL_51]], align 4 -// CHECK: store float %[[VAL_116]], ptr %[[VAL_114]], align 4 +// CHECK: %[[VAL_116:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_51]], align 4 +// CHECK: store float %[[VAL_116]], ptr{{( addrspace\(5\))?}} %[[VAL_114]], align 4 // CHECK: br label %[[VAL_19]] // CHECK: entry: // CHECK: %[[VAL_117:.*]] = alloca float, align 4 -// CHECK: %[[VAL_118:.*]] = load float, ptr %[[VAL_119:.*]], align 4 -// CHECK: %[[VAL_120:.*]] = load float, ptr %[[VAL_121:.*]], align 4 +// CHECK: %[[VAL_118:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_119:.*]], align 4 +// CHECK: %[[VAL_120:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_121:.*]], align 4 // CHECK: %[[VAL_122:.*]] = fadd float %[[VAL_118]], %[[VAL_120]] -// CHECK: store float %[[VAL_122]], ptr %[[VAL_117]], align 4 -// CHECK: %[[VAL_123:.*]] = load float, ptr %[[VAL_117]], align 4 -// CHECK: store float %[[VAL_123]], ptr %[[VAL_124:.*]], align 4 +// CHECK: store float %[[VAL_122]], ptr{{( addrspace\(5\))?}} %[[VAL_117]], align 4 +// CHECK: %[[VAL_123:.*]] = load float, ptr{{( addrspace\(5\))?}} %[[VAL_117]], align 4 +// CHECK: store float %[[VAL_123]], ptr{{( addrspace\(5\))?}} %[[VAL_124:.*]], align 4 // CHECK: ret void